From 17b647dfe58da2453ae345bfd0a75e66055e5591 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Sep 2023 09:59:38 +0200 Subject: [PATCH 001/127] Wip --- Cargo.lock | 1 + milli/Cargo.toml | 1 + milli/src/search/new/tests/sort.rs | 1 + .../extract/extract_docid_word_positions.rs | 19 +-- .../extract/extract_fid_word_count_docids.rs | 67 ++------ .../extract/extract_word_docids.rs | 138 +++++++++++++---- .../extract/extract_word_fid_docids.rs | 2 + .../extract_word_pair_proximity_docids.rs | 145 ++++++++---------- .../extract/extract_word_position_docids.rs | 13 +- .../src/update/index_documents/extract/mod.rs | 33 ++-- .../index_documents/helpers/grenad_helpers.rs | 26 ++++ milli/src/update/index_documents/mod.rs | 14 +- .../src/update/index_documents/typed_chunk.rs | 28 +++- 13 files changed, 288 insertions(+), 200 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3991d130..d8cd12cc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2703,6 +2703,7 @@ dependencies = [ "logging_timer", "maplit", "md5", + "meili-snap", "memmap2", "mimalloc", "obkv", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b19b40e85..68bc2d2b5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -79,6 +79,7 @@ big_s = "1.0.2" insta = "1.29.0" maplit = "1.0.2" md5 = "0.7.0" +meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs index aa6aa971f..8fdf52d44 100644 --- a/milli/src/search/new/tests/sort.rs +++ b/milli/src/search/new/tests/sort.rs @@ -13,6 +13,7 @@ This module tests the `sort` ranking rule: use big_s::S; use maplit::hashset; +use meili_snap::insta; use crate::index::tests::TempIndex; use crate::search::new::tests::collect_field_values; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 643d16354..6aa66c92a 100644 --- 
a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -5,11 +5,11 @@ use std::io::BufReader; use std::{io, mem, str}; use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; -use obkv::KvReader; +use obkv::{KvReader, KvWriterU16}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; use crate::update::index_documents::MergeFn; use crate::{ @@ -43,7 +43,7 @@ pub fn extract_docid_word_positions( let mut script_language_docids = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, - concat_u32s_array, + keep_latest_obkv, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -156,6 +156,7 @@ fn extract_tokens_from_document( let tokens = process_tokens(tokenizer.tokenize(field)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + let mut writer = KvWriterU16::memory(); for (index, token) in tokens { // if a language has been detected for the token, we update the counter. 
if let Some(language) = token.language { @@ -169,17 +170,17 @@ fn extract_tokens_from_document( } let token = token.lemma().trim(); if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - buffers.key_buffer.truncate(mem::size_of::()); - buffers.key_buffer.extend_from_slice(token.as_bytes()); - let position: u16 = index .try_into() .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let position = absolute_from_relative_position(field_id, position); - docid_word_positions_sorter - .insert(&buffers.key_buffer, position.to_ne_bytes())?; + writer.insert(position, token.as_bytes())?; } } + + let positions = writer.into_inner()?; + buffers.key_buffer.truncate(mem::size_of::()); + buffers.key_buffer.extend_from_slice(&field_id.to_be_bytes()); + docid_word_positions_sorter.insert(&buffers.key_buffer, positions)?; } } } diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 92564b4cd..289a744da 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -1,16 +1,17 @@ -use std::collections::HashMap; use std::fs::File; use std::io::{self, BufReader}; -use grenad::Sorter; +use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; +use crate::Result; + +const MAX_COUNTED_WORDS: usize = 30; /// Extracts the field id word count and the documents ids where /// this field id with this amount of words appear. 
@@ -35,63 +36,21 @@ pub fn extract_fid_word_count_docids( max_memory, ); - // This map is assumed to not consume a lot of memory. - let mut document_fid_wordcount = HashMap::new(); - let mut current_document_id = None; - + let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, _word_bytes) = try_split_array_at(key) + let (document_id_bytes, fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let curr_document_id = *current_document_id.get_or_insert(document_id); - if curr_document_id != document_id { - drain_document_fid_wordcount_into_sorter( - &mut fid_word_count_docids_sorter, - &mut document_fid_wordcount, - curr_document_id, - )?; - current_document_id = Some(document_id); - } - - for position in read_u32_ne_bytes(value) { - let (field_id, _) = relative_from_absolute_position(position); - - let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); - *value += 1; - } - } - - if let Some(document_id) = current_document_id { - // We must make sure that don't lose the current document field id - // word count map if we break because we reached the end of the chunk. 
- drain_document_fid_wordcount_into_sorter( - &mut fid_word_count_docids_sorter, - &mut document_fid_wordcount, - document_id, - )?; - } - - sorter_into_reader(fid_word_count_docids_sorter, indexer) -} - -fn drain_document_fid_wordcount_into_sorter( - fid_word_count_docids_sorter: &mut Sorter, - document_fid_wordcount: &mut HashMap, - document_id: DocumentId, -) -> Result<()> { - let mut key_buffer = Vec::new(); - - for (fid, count) in document_fid_wordcount.drain() { - if count <= 30 { + let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count(); + if word_count <= MAX_COUNTED_WORDS { key_buffer.clear(); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - key_buffer.push(count as u8); - + key_buffer.extend_from_slice(fid_bytes); + key_buffer.push(word_count as u8); fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } } - Ok(()) + sorter_into_reader(fid_word_count_docids_sorter, indexer) } diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index f211f7023..8b93ea23c 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,18 +1,19 @@ -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use std::fs::File; use std::io::{self, BufReader}; use std::iter::FromIterator; +use obkv::KvReaderU16; use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, - try_split_array_at, GrenadParameters, + create_sorter, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, serialize_roaring_bitmap, + sorter_into_reader, try_split_array_at, GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::update::index_documents::helpers::read_u32_ne_bytes; -use 
crate::{relative_from_absolute_position, FieldId, Result}; +use crate::update::MergeFn; +use crate::{DocumentId, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. /// @@ -26,7 +27,11 @@ pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, exact_attributes: &HashSet, -) -> Result<(grenad::Reader>, grenad::Reader>)> { +) -> Result<( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, +)> { puffin::profile_function!(); let max_memory = indexer.max_memory_by_thread(); @@ -37,7 +42,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|x| x / 2), + max_memory.map(|x| x / 3), ); let mut exact_word_docids_sorter = create_sorter( @@ -46,45 +51,116 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|x| x / 2), + max_memory.map(|x| x / 3), ); + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|x| x / 3), + ); + + let mut current_document_id = None; + let mut fid = 0; + let mut key_buffer = Vec::new(); let mut value_buffer = Vec::new(); + let mut words = BTreeSet::new(); + let mut exact_words = BTreeSet::new(); let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, positions)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) + while let Some((key, value)) = cursor.move_on_next()? 
{ + let (document_id_bytes, fid_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let (fid_bytes, _) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); + fid = u16::from_be_bytes(fid_bytes); - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; + // drain the btreemaps when we change document. + if current_document_id.map_or(false, |id| id != document_id) { + words_into_sorters( + document_id, + fid, + &mut key_buffer, + &mut value_buffer, + &mut exact_words, + &mut words, + &mut exact_word_docids_sorter, + &mut word_docids_sorter, + &mut word_fid_docids_sorter, + )?; + } - // If there are no exact attributes, we do not need to iterate over positions. - if exact_attributes.is_empty() { - word_docids_sorter.insert(word_bytes, &value_buffer)?; + current_document_id = Some(document_id); + + // every words contained in an attribute set to exact must be pushed in the exact_words list. + if exact_attributes.contains(&fid) { + for (_pos, word) in KvReaderU16::new(&value).iter() { + exact_words.insert(word.to_vec()); + } } else { - let mut added_to_exact = false; - let mut added_to_word_docids = false; - for position in read_u32_ne_bytes(positions) { - // as soon as we know that this word had been to both readers, we don't need to - // iterate over the positions. 
- if added_to_exact && added_to_word_docids { - break; - } - let (fid, _) = relative_from_absolute_position(position); - if exact_attributes.contains(&fid) && !added_to_exact { - exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; - added_to_exact = true; - } else if !added_to_word_docids { - word_docids_sorter.insert(word_bytes, &value_buffer)?; - added_to_word_docids = true; - } + for (_pos, word) in KvReaderU16::new(&value).iter() { + words.insert(word.to_vec()); } } } + // We must make sure that don't lose the current document field id + if let Some(document_id) = current_document_id { + words_into_sorters( + document_id, + fid, + &mut key_buffer, + &mut value_buffer, + &mut exact_words, + &mut words, + &mut exact_word_docids_sorter, + &mut word_docids_sorter, + &mut word_fid_docids_sorter, + )?; + } + Ok(( sorter_into_reader(word_docids_sorter, indexer)?, sorter_into_reader(exact_word_docids_sorter, indexer)?, + sorter_into_reader(word_fid_docids_sorter, indexer)?, )) } + +fn words_into_sorters( + document_id: DocumentId, + fid: FieldId, + key_buffer: &mut Vec, + value_buffer: &mut Vec, + exact_words: &mut BTreeSet>, + words: &mut BTreeSet>, + exact_word_docids_sorter: &mut grenad::Sorter, + word_docids_sorter: &mut grenad::Sorter, + word_fid_docids_sorter: &mut grenad::Sorter, +) -> Result<()> { + puffin::profile_function!(); + let bitmap = RoaringBitmap::from_iter(Some(document_id)); + serialize_roaring_bitmap(&bitmap, value_buffer)?; + for word_bytes in exact_words.iter() { + exact_word_docids_sorter.insert(word_bytes, &mut *value_buffer)?; + } + + for word_bytes in words.iter() { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + } + + for word_bytes in (&*words | &*exact_words).iter() { + key_buffer.clear(); + key_buffer.extend_from_slice(&word_bytes); + key_buffer.push(0); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(word_bytes, &value_buffer)?; + } + + exact_words.clear(); + words.clear(); + + 
Ok(()) +} diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs index 09f571038..dd4d42431 100644 --- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -17,6 +17,8 @@ pub fn extract_word_fid_docids( ) -> Result>> { puffin::profile_function!(); + todo!("remove me"); + let max_memory = indexer.max_memory_by_thread(); let mut word_fid_docids_sorter = create_sorter( diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9ddd5ff4c..41604ff4a 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,12 +1,14 @@ use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::HashMap; use std::fs::File; use std::io::BufReader; -use std::{cmp, io, mem, str, vec}; +use std::{cmp, io}; + +use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + GrenadParameters, MergeFn, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; @@ -35,44 +37,59 @@ pub fn extract_word_pair_proximity_docids( max_memory.map(|m| m / 2), ); - // This map is assumed to not consume a lot of memory. 
- let mut document_word_positions_heap = BinaryHeap::new(); + let mut word_positions: Vec<(String, u16)> = Vec::with_capacity(MAX_DISTANCE as usize); + let mut word_pair_proximity = HashMap::new(); let mut current_document_id = None; let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let (document_id_bytes, _fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let word = str::from_utf8(word_bytes)?; - let curr_document_id = *current_document_id.get_or_insert(document_id); - if curr_document_id != document_id { - let document_word_positions_heap = mem::take(&mut document_word_positions_heap); - document_word_positions_into_sorter( - curr_document_id, - document_word_positions_heap, - &mut word_pair_proximity_docids_sorter, - )?; - current_document_id = Some(document_id); - } + for (position, word) in KvReaderU16::new(&value).iter() { + // if we change document, we fill the sorter + if current_document_id.map_or(false, |id| id != document_id) { + while !word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut word_positions, + &mut word_pair_proximity, + )?; + } - let word = word.to_string(); - let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); - positions.sort_unstable(); - let mut iter = positions.into_iter(); - if let Some(position) = iter.next() { - document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); + document_word_positions_into_sorter( + document_id, + &word_pair_proximity, + &mut word_pair_proximity_docids_sorter, + )?; + word_pair_proximity.clear(); + word_positions.clear(); + } + + // drain the proximity window until the head word is considered close to the word we are inserting. 
+ while word_positions.get(0).map_or(false, |(_w, p)| { + positions_proximity(*p as u32, position as u32) > MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut word_positions, + &mut word_pair_proximity, + )?; + } + + // insert the new word. + let word = std::str::from_utf8(word)?; + word_positions.push((word.to_string(), position)); } } if let Some(document_id) = current_document_id { - // We must make sure that don't lose the current document field id - // word count map if we break because we reached the end of the chunk. - let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + while !word_positions.is_empty() { + word_positions_into_word_pair_proximity(&mut word_positions, &mut word_pair_proximity)?; + } + document_word_positions_into_sorter( document_id, - document_word_positions_heap, + &word_pair_proximity, &mut word_pair_proximity_docids_sorter, )?; } @@ -86,64 +103,13 @@ pub fn extract_word_pair_proximity_docids( /// close to each other. fn document_word_positions_into_sorter( document_id: DocumentId, - mut word_positions_heap: BinaryHeap>>, + word_pair_proximity: &HashMap<(String, String), u8>, word_pair_proximity_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { - let mut word_pair_proximity = HashMap::new(); - let mut ordered_peeked_word_positions = Vec::new(); - while !word_positions_heap.is_empty() { - while let Some(peeked_word_position) = word_positions_heap.pop() { - ordered_peeked_word_positions.push(peeked_word_position); - if ordered_peeked_word_positions.len() == 7 { - break; - } - } - - if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { - for PeekedWordPosition { word, position, .. } in tail { - let prox = positions_proximity(head.position, *position); - if prox > 0 && prox < MAX_DISTANCE { - word_pair_proximity - .entry((head.word.clone(), word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } - } - - // Push the tail in the heap. 
- let tail_iter = ordered_peeked_word_positions.drain(1..); - word_positions_heap.extend(tail_iter); - - // Advance the head and push it in the heap. - if let Some(mut head) = ordered_peeked_word_positions.pop() { - if let Some(next_position) = head.iter.next() { - let prox = positions_proximity(head.position, next_position); - - if prox > 0 && prox < MAX_DISTANCE { - word_pair_proximity - .entry((head.word.clone(), head.word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } - - word_positions_heap.push(PeekedWordPosition { - word: head.word, - position: next_position, - iter: head.iter, - }); - } - } - } - } - let mut key_buffer = Vec::new(); for ((w1, w2), prox) in word_pair_proximity { key_buffer.clear(); - key_buffer.push(prox as u8); + key_buffer.push(*prox as u8); key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); @@ -154,6 +120,23 @@ fn document_word_positions_into_sorter( Ok(()) } +fn word_positions_into_word_pair_proximity( + word_positions: &mut Vec<(String, u16)>, + word_pair_proximity: &mut HashMap<(String, String), u8>, +) -> Result<()> { + let (head_word, head_position) = word_positions.remove(0); + for (word, position) in word_positions.iter() { + let prox = positions_proximity(head_position as u32, *position as u32) as u8; + word_pair_proximity + .entry((head_word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } + Ok(()) +} + struct PeekedWordPosition { word: String, position: u32, diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 94139ddf8..db2f6217f 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -1,13 +1,15 @@ use std::fs::File; use std::io::{self, BufReader}; +use 
obkv::KvReaderU16; + use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; +use crate::{bucketed_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. /// @@ -34,15 +36,14 @@ pub fn extract_word_position_docids( let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) + let (document_id_bytes, fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); - for position in read_u32_ne_bytes(value) { + for (position, word_bytes) in KvReaderU16::new(&value).iter() { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); key_buffer.push(0); - let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index f44eac8f5..a6cc04111 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -176,16 +176,23 @@ pub(crate) fn data_from_obkv_documents( spawn_extraction_task::< _, _, - Vec<(grenad::Reader>, grenad::Reader>)>, + Vec<( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + )>, >( docid_word_positions_chunks.clone(), indexer, 
lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), merge_roaring_bitmaps, - |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, + |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } }, "word-docids", ); @@ -199,15 +206,15 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::WordPositionDocids, "word-position-docids", ); - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks, - indexer, - lmdb_writer_sx.clone(), - extract_word_fid_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::WordFidDocids, - "word-fid-docids", - ); + // spawn_extraction_task::<_, _, Vec>>>( + // docid_word_positions_chunks, + // indexer, + // lmdb_writer_sx.clone(), + // extract_word_fid_docids, + // merge_cbo_roaring_bitmaps, + // TypedChunk::WordFidDocids, + // "word-fid-docids", + // ); spawn_extraction_task::<_, _, Vec>>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 582bf2a5b..6c3a81a0e 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader>, grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + )> +{ + type Output = ( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + ); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + let mut m3 = MergerBuilder::new(merge_fn); + for (r1, r2, r3) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + m3.push(r3)?; + } + Ok((m1.finish(params)?, m2.finish(params)?, 
m3.finish(params)?)) + } +} + struct MergerBuilder(grenad::MergerBuilder); impl MergerBuilder { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 52aa1113e..58219f28c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -406,13 +406,23 @@ where } let typed_chunk = match result? { - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); let cloneable_chunk = unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; exact_word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } } TypedChunk::WordPairProximityDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5895a69c5..d57484cab 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -32,6 +32,7 @@ pub(crate) enum TypedChunk { WordDocids { word_docids_reader: grenad::Reader>, exact_word_docids_reader: grenad::Reader>, + word_fid_docids_reader: grenad::Reader>, }, WordPositionDocids(grenad::Reader>), WordFidDocids(grenad::Reader>), @@ -64,10 +65,15 @@ impl TypedChunk { TypedChunk::NewDocumentsIds(grenad) => { format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!( - "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}", + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => format!( + "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", word_docids_reader.len(), - exact_word_docids_reader.len() + exact_word_docids_reader.len(), + word_fid_docids_reader.len() ), TypedChunk::WordPositionDocids(grenad) => { format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) @@ -138,7 +144,11 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::NewDocumentsIds(documents_ids) => { return Ok((documents_ids, is_merged_database)) } - TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), @@ -159,6 +169,16 @@ pub(crate) fn write_typed_chunk_into_index( merge_roaring_bitmaps, )?; + let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; + append_entries_into_database( + 
word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + // create fst from word docids let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; let db_fst = index.words_fst(wtxn)?; From 748b333161729c63a5611b2b56dbe99544648b1d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Sep 2023 14:12:43 +0200 Subject: [PATCH 002/127] Add usefull debug assert before key insertion in database --- milli/src/update/index_documents/typed_chunk.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index d57484cab..a450b5f34 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -475,6 +475,7 @@ where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, + K: for<'a> heed::BytesDecode<'a>, { puffin::profile_function!(format!("number of entries: {}", data.len())); @@ -495,6 +496,12 @@ where let mut cursor = data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { + debug_assert!( + K::bytes_decode(&key).is_some(), + "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", + key.len(), + &key + ); buffer.clear(); let value = serialize_value(value, &mut buffer)?; unsafe { database.append(key, value)? 
}; From 8d77736a6795cba0e2eff9727015928f0aa13c3b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Sep 2023 14:20:57 +0200 Subject: [PATCH 003/127] Fix fid_word_docids --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 8b93ea23c..8c72ba48a 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -156,7 +156,7 @@ fn words_into_sorters( key_buffer.extend_from_slice(&word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(word_bytes, &value_buffer)?; + word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?; } exact_words.clear(); From 11ea5acff94b70ab0181010a277c663630b5d1d4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Sep 2023 18:29:21 +0200 Subject: [PATCH 004/127] Fix --- .../extract/extract_word_docids.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 32 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 8c72ba48a..84c6f8635 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -73,7 +73,7 @@ pub fn extract_word_docids( while let Some((key, value)) = cursor.move_on_next()? 
{ let (document_id_bytes, fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let (fid_bytes, _) = try_split_array_at(key) + let (fid_bytes, _) = try_split_array_at(fid_bytes) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); fid = u16::from_be_bytes(fid_bytes); diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 41604ff4a..6373d5822 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -47,25 +47,25 @@ pub fn extract_word_pair_proximity_docids( .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - for (position, word) in KvReaderU16::new(&value).iter() { - // if we change document, we fill the sorter - if current_document_id.map_or(false, |id| id != document_id) { - while !word_positions.is_empty() { - word_positions_into_word_pair_proximity( - &mut word_positions, - &mut word_pair_proximity, - )?; - } - - document_word_positions_into_sorter( - document_id, - &word_pair_proximity, - &mut word_pair_proximity_docids_sorter, + // if we change document, we fill the sorter + if current_document_id.map_or(false, |id| id != document_id) { + while !word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut word_positions, + &mut word_pair_proximity, )?; - word_pair_proximity.clear(); - word_positions.clear(); } + document_word_positions_into_sorter( + document_id, + &word_pair_proximity, + &mut word_pair_proximity_docids_sorter, + )?; + word_pair_proximity.clear(); + word_positions.clear(); + } + + for (position, word) in KvReaderU16::new(&value).iter() { // drain the 
proximity window until the head word is considered close to the word we are inserting. while word_positions.get(0).map_or(false, |(_w, p)| { positions_proximity(*p as u32, position as u32) > MAX_DISTANCE From db1ca2123103f77589d28868d76bff87c3fd567c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Sep 2023 18:40:35 +0200 Subject: [PATCH 005/127] add puffin in sorter into reeder function --- milli/src/update/index_documents/helpers/grenad_helpers.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 6c3a81a0e..cc0ccb609 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -54,6 +54,7 @@ pub fn sorter_into_reader( sorter: grenad::Sorter, indexer: GrenadParameters, ) -> Result>> { + puffin::profile_function!(); let mut writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, From 8ccf32d1a06f4225c752be2554b28791704a5254 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Sep 2023 19:29:06 +0200 Subject: [PATCH 006/127] Compute word_fid_docids before word_docids and exact_word_docids --- .../extract/extract_word_docids.rs | 139 +++++++++--------- 1 file changed, 68 insertions(+), 71 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 84c6f8635..8409f2836 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -3,14 +3,16 @@ use std::fs::File; use std::io::{self, BufReader}; use std::iter::FromIterator; +use heed::BytesDecode; use obkv::KvReaderU16; use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, serialize_roaring_bitmap, - sorter_into_reader, 
try_split_array_at, GrenadParameters, + create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap, + sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; +use crate::heed_codec::StrBEU16Codec; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::MergeFn; use crate::{DocumentId, FieldId, Result}; @@ -36,6 +38,59 @@ pub fn extract_word_docids( let max_memory = indexer.max_memory_by_thread(); + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|x| x / 3), + ); + + let mut current_document_id = None; + let mut fid = 0; + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); + let mut words = BTreeSet::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, fid_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let (fid_bytes, _) = try_split_array_at(fid_bytes) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = u32::from_be_bytes(document_id_bytes); + fid = u16::from_be_bytes(fid_bytes); + + // drain the btreemaps when we change document. 
+ if current_document_id.map_or(false, |id| id != document_id) { + words_into_sorter( + document_id, + fid, + &mut key_buffer, + &mut value_buffer, + &mut words, + &mut word_fid_docids_sorter, + )?; + } + + current_document_id = Some(document_id); + for (_pos, word) in KvReaderU16::new(&value).iter() { + words.insert(word.to_vec()); + } + } + + // We must make sure that don't lose the current document field id + if let Some(document_id) = current_document_id { + words_into_sorter( + document_id, + fid, + &mut key_buffer, + &mut value_buffer, + &mut words, + &mut word_fid_docids_sorter, + )?; + } + let mut word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, @@ -54,104 +109,47 @@ pub fn extract_word_docids( max_memory.map(|x| x / 3), ); - let mut word_fid_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + let mut word_fid_docids_writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|x| x / 3), + tempfile::tempfile()?, ); - let mut current_document_id = None; - let mut fid = 0; - let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); - let mut words = BTreeSet::new(); - let mut exact_words = BTreeSet::new(); - let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, fid_bytes) = try_split_array_at(key) - .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let (fid_bytes, _) = try_split_array_at(fid_bytes) - .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let document_id = u32::from_be_bytes(document_id_bytes); - fid = u16::from_be_bytes(fid_bytes); + let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { + word_fid_docids_writer.insert(key, value)?; - // drain the btreemaps when we change document. 
- if current_document_id.map_or(false, |id| id != document_id) { - words_into_sorters( - document_id, - fid, - &mut key_buffer, - &mut value_buffer, - &mut exact_words, - &mut words, - &mut exact_word_docids_sorter, - &mut word_docids_sorter, - &mut word_fid_docids_sorter, - )?; - } - - current_document_id = Some(document_id); + let (word, fid) = StrBEU16Codec::bytes_decode(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; // every words contained in an attribute set to exact must be pushed in the exact_words list. if exact_attributes.contains(&fid) { - for (_pos, word) in KvReaderU16::new(&value).iter() { - exact_words.insert(word.to_vec()); - } + exact_word_docids_sorter.insert(word.as_bytes(), &value)?; } else { - for (_pos, word) in KvReaderU16::new(&value).iter() { - words.insert(word.to_vec()); - } + word_docids_sorter.insert(word.as_bytes(), &value)?; } } - // We must make sure that don't lose the current document field id - if let Some(document_id) = current_document_id { - words_into_sorters( - document_id, - fid, - &mut key_buffer, - &mut value_buffer, - &mut exact_words, - &mut words, - &mut exact_word_docids_sorter, - &mut word_docids_sorter, - &mut word_fid_docids_sorter, - )?; - } - Ok(( sorter_into_reader(word_docids_sorter, indexer)?, sorter_into_reader(exact_word_docids_sorter, indexer)?, - sorter_into_reader(word_fid_docids_sorter, indexer)?, + writer_into_reader(word_fid_docids_writer)?, )) } -fn words_into_sorters( +fn words_into_sorter( document_id: DocumentId, fid: FieldId, key_buffer: &mut Vec, value_buffer: &mut Vec, - exact_words: &mut BTreeSet>, words: &mut BTreeSet>, - exact_word_docids_sorter: &mut grenad::Sorter, - word_docids_sorter: &mut grenad::Sorter, word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { puffin::profile_function!(); let bitmap = RoaringBitmap::from_iter(Some(document_id)); serialize_roaring_bitmap(&bitmap, value_buffer)?; - for word_bytes in exact_words.iter() { - 
exact_word_docids_sorter.insert(word_bytes, &mut *value_buffer)?; - } for word_bytes in words.iter() { - word_docids_sorter.insert(word_bytes, &value_buffer)?; - } - - for word_bytes in (&*words | &*exact_words).iter() { key_buffer.clear(); key_buffer.extend_from_slice(&word_bytes); key_buffer.push(0); @@ -159,7 +157,6 @@ fn words_into_sorters( word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?; } - exact_words.clear(); words.clear(); Ok(()) From b541d48847ff81382a69a925eac702356aa85287 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 21 Sep 2023 10:02:08 +0200 Subject: [PATCH 007/127] Add buffer to the obkv writter --- .../extract/extract_docid_word_positions.rs | 5 +++- .../extract/extract_word_position_docids.rs | 30 ++++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 6aa66c92a..ea329b212 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -156,7 +156,8 @@ fn extract_tokens_from_document( let tokens = process_tokens(tokenizer.tokenize(field)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); - let mut writer = KvWriterU16::memory(); + buffers.obkv_buffer.clear(); + let mut writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (index, token) in tokens { // if a language has been detected for the token, we update the counter. if let Some(language) = token.language { @@ -294,4 +295,6 @@ struct Buffers { key_buffer: Vec, // the field buffer for each fields desserialization, and must be cleared between each field. field_buffer: String, + // buffer used to store the value data containing an obkv. 
+ obkv_buffer: Vec, } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index db2f6217f..0b07f63b5 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::fs::File; use std::io::{self, BufReader}; @@ -33,18 +34,39 @@ pub fn extract_word_position_docids( max_memory, ); + let mut word_positions: HashSet<(u16, Vec)> = HashSet::new(); + let mut current_document_id = None; let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, fid_bytes) = try_split_array_at(key) + let (document_id_bytes, _fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); + if current_document_id.map_or(false, |id| document_id != id) { + for (position, word_bytes) in word_positions.iter() { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); + key_buffer.extend_from_slice(&position.to_be_bytes()); + word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + word_positions.clear(); + } + + current_document_id = Some(document_id); + for (position, word_bytes) in KvReaderU16::new(&value).iter() { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); let position = bucketed_position(position); + word_positions.insert((position, word_bytes.to_vec())); + } + } + + if let Some(document_id) = current_document_id { + for (position, word_bytes) in word_positions { + key_buffer.clear(); + key_buffer.extend_from_slice(&word_bytes); + key_buffer.push(0); key_buffer.extend_from_slice(&position.to_be_bytes()); 
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } From df9e5c8651d6043d522d425457b2e9559ddb8224 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 25 Sep 2023 16:39:32 +0200 Subject: [PATCH 008/127] Generalize usage of CboRoaringBitmap codec to ease the use --- milli/src/index.rs | 8 ++--- milli/src/search/new/db_cache.rs | 8 ++--- milli/src/update/delete_documents.rs | 4 +-- .../extract/extract_docid_word_positions.rs | 6 ++++ .../extract/extract_word_docids.rs | 35 +++++-------------- .../extract_word_pair_proximity_docids.rs | 4 ++- .../extract/extract_word_position_docids.rs | 5 +-- .../src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 6 ++-- .../src/update/index_documents/typed_chunk.rs | 4 +-- milli/src/update/word_prefix_docids.rs | 16 ++++----- 11 files changed, 44 insertions(+), 54 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index d563f852b..288223a95 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -119,16 +119,16 @@ pub struct Index { pub(crate) main: PolyDatabase, /// A word and all the documents ids containing the word. - pub word_docids: Database, + pub word_docids: Database, /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. - pub exact_word_docids: Database, + pub exact_word_docids: Database, /// A prefix of word and all the documents ids containing this prefix. - pub word_prefix_docids: Database, + pub word_prefix_docids: Database, /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. - pub exact_word_prefix_docids: Database, + pub exact_word_prefix_docids: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. 
pub word_pair_proximity_docids: Database, diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index e0a2ba3cf..3f4751185 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), @@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> { &mut self, word: Interned, ) -> Result> { - DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), @@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), @@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 164ad0c7e..c3b2cf1a3 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -499,7 +499,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { fn remove_from_word_prefix_docids( txn: &mut heed::RwTxn, - db: &Database, + db: &Database, to_remove: &RoaringBitmap, ) -> Result>> { puffin::profile_function!(); @@ -529,7 +529,7 @@ fn remove_from_word_prefix_docids( fn remove_from_word_docids( txn: &mut heed::RwTxn, - db: &heed::Database, + db: &heed::Database, to_remove: &RoaringBitmap, words_to_keep: &mut BTreeSet, words_to_remove: &mut BTreeSet, 
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ea329b212..a45d488e4 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -107,6 +107,12 @@ pub fn extract_docid_word_positions( if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); } + if let Some(dictionary) = dictionary { + tokenizer_builder.words_dict(dictionary); + } + if let Some(separators) = allowed_separators { + tokenizer_builder.separators(separators); + } tokenizer_builder.allow_list(&script_language); let tokenizer = tokenizer_builder.build(); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 8409f2836..d9fb72cc2 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -8,7 +8,7 @@ use obkv::KvReaderU16; use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; @@ -40,15 +40,12 @@ pub fn extract_word_docids( let mut word_fid_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|x| x / 3), ); - - let mut current_document_id = None; - let mut fid = 0; let mut key_buffer = Vec::new(); let mut value_buffer = Vec::new(); let mut words = BTreeSet::new(); @@ -59,28 +56,12 @@ pub fn extract_word_docids( let (fid_bytes, _) = 
try_split_array_at(fid_bytes) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - fid = u16::from_be_bytes(fid_bytes); + let fid = u16::from_be_bytes(fid_bytes); - // drain the btreemaps when we change document. - if current_document_id.map_or(false, |id| id != document_id) { - words_into_sorter( - document_id, - fid, - &mut key_buffer, - &mut value_buffer, - &mut words, - &mut word_fid_docids_sorter, - )?; - } - - current_document_id = Some(document_id); for (_pos, word) in KvReaderU16::new(&value).iter() { words.insert(word.to_vec()); } - } - // We must make sure that don't lose the current document field id - if let Some(document_id) = current_document_id { words_into_sorter( document_id, fid, @@ -89,11 +70,13 @@ pub fn extract_word_docids( &mut words, &mut word_fid_docids_sorter, )?; + + words.clear(); } let mut word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -102,7 +85,7 @@ pub fn extract_word_docids( let mut exact_word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -146,15 +129,13 @@ fn words_into_sorter( word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { puffin::profile_function!(); - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - serialize_roaring_bitmap(&bitmap, value_buffer)?; for word_bytes in words.iter() { key_buffer.clear(); key_buffer.extend_from_slice(&word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?; + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } words.clear(); diff --git 
a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 6373d5822..d54513786 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -57,7 +57,7 @@ pub fn extract_word_pair_proximity_docids( } document_word_positions_into_sorter( - document_id, + current_document_id.unwrap(), &word_pair_proximity, &mut word_pair_proximity_docids_sorter, )?; @@ -65,6 +65,8 @@ pub fn extract_word_pair_proximity_docids( word_positions.clear(); } + current_document_id = Some(document_id); + for (position, word) in KvReaderU16::new(&value).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions.get(0).map_or(false, |(_w, p)| { diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 0b07f63b5..220dca960 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -35,7 +35,7 @@ pub fn extract_word_position_docids( ); let mut word_positions: HashSet<(u16, Vec)> = HashSet::new(); - let mut current_document_id = None; + let mut current_document_id: Option = None; let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ @@ -49,7 +49,8 @@ pub fn extract_word_position_docids( key_buffer.extend_from_slice(word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + word_position_docids_sorter + .insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?; } word_positions.clear(); } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index a6cc04111..32ec6fe5c 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -186,7 +186,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { TypedChunk::WordDocids { word_docids_reader, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58219f28c..22e42937f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -38,7 +38,7 @@ use crate::update::{ self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::{Index, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -700,8 +700,8 @@ where fn execute_word_prefix_docids( txn: &mut heed::RwTxn, reader: grenad::Reader>, - word_docids_db: Database, - word_prefix_docids_db: Database, + word_docids_db: Database, + word_prefix_docids_db: Database, indexer_config: &IndexerConfig, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], diff --git 
a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index a450b5f34..cf3194255 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -156,7 +156,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, |value, _buffer| Ok(value), - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -166,7 +166,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, |value, _buffer| Ok(value), - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, )?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index a30254994..980bab01a 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str}; use heed::Database; use crate::update::index_documents::{ - create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - word_docids: Database, - word_prefix_docids: Database, + word_docids: Database, + word_prefix_docids: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, @@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - word_docids: Database, - word_prefix_docids: Database, + word_docids: Database, + word_prefix_docids: Database, ) -> 
WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, @@ -51,7 +51,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -115,7 +115,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.wtxn, *self.word_prefix_docids.as_polymorph(), prefix_docids_sorter, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, )?; Ok(()) From 96be85396d4fd0199683729b27091d6069b5efcf Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 25 Sep 2023 18:55:20 +0200 Subject: [PATCH 009/127] Use a vecDeque in wpp database --- .../extract_word_pair_proximity_docids.rs | 37 +++---------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index d54513786..fb0ea1ca8 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,5 +1,5 @@ use std::cmp::Ordering; -use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::BufReader; use std::{cmp, io}; @@ -37,7 +37,8 @@ pub fn extract_word_pair_proximity_docids( max_memory.map(|m| m / 2), ); - let mut word_positions: Vec<(String, u16)> = Vec::with_capacity(MAX_DISTANCE as usize); + let mut word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); let mut word_pair_proximity = HashMap::new(); let mut current_document_id = None; @@ -80,7 +81,7 @@ pub fn extract_word_pair_proximity_docids( // insert the new word. 
let word = std::str::from_utf8(word)?; - word_positions.push((word.to_string(), position)); + word_positions.push_back((word.to_string(), position)); } } @@ -123,10 +124,10 @@ fn document_word_positions_into_sorter( } fn word_positions_into_word_pair_proximity( - word_positions: &mut Vec<(String, u16)>, + word_positions: &mut VecDeque<(String, u16)>, word_pair_proximity: &mut HashMap<(String, String), u8>, ) -> Result<()> { - let (head_word, head_position) = word_positions.remove(0); + let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = positions_proximity(head_position as u32, *position as u32) as u8; word_pair_proximity @@ -138,29 +139,3 @@ fn word_positions_into_word_pair_proximity( } Ok(()) } - -struct PeekedWordPosition { - word: String, - position: u32, - iter: I, -} - -impl Ord for PeekedWordPosition { - fn cmp(&self, other: &Self) -> Ordering { - self.position.cmp(&other.position).reverse() - } -} - -impl PartialOrd for PeekedWordPosition { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Eq for PeekedWordPosition {} - -impl PartialEq for PeekedWordPosition { - fn eq(&self, other: &Self) -> bool { - self.position == other.position - } -} From 28a8d0ccdac9a2ae3e0ed4bb91127b11df5c36b0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 26 Sep 2023 10:08:36 +0200 Subject: [PATCH 010/127] Fix word pair proximity --- .../extract/extract_word_pair_proximity_docids.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index fb0ea1ca8..847da01c5 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,4 +1,3 @@ -use std::cmp::Ordering; use 
std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::BufReader; @@ -12,7 +11,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::proximity::{positions_proximity, MAX_DISTANCE}; +use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::{DocumentId, Result}; /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. @@ -71,7 +70,7 @@ pub fn extract_word_pair_proximity_docids( for (position, word) in KvReaderU16::new(&value).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions.get(0).map_or(false, |(_w, p)| { - positions_proximity(*p as u32, position as u32) > MAX_DISTANCE + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE }) { word_positions_into_word_pair_proximity( &mut word_positions, @@ -109,6 +108,7 @@ fn document_word_positions_into_sorter( word_pair_proximity: &HashMap<(String, String), u8>, word_pair_proximity_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { + puffin::profile_function!(); let mut key_buffer = Vec::new(); for ((w1, w2), prox) in word_pair_proximity { key_buffer.clear(); @@ -127,9 +127,10 @@ fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(String, u16)>, word_pair_proximity: &mut HashMap<(String, String), u8>, ) -> Result<()> { + puffin::profile_function!(); let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { - let prox = positions_proximity(head_position as u32, *position as u32) as u8; + let prox = index_proximity(head_position as u32, *position as u32) as u8; word_pair_proximity .entry((head_word.clone(), word.clone())) .and_modify(|p| { From 66c2c82a18614208b5b47f9597aa8a8509d1e697 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 28 Sep 2023 10:45:25 +0200 Subject: [PATCH 011/127] Split wpp in several sorters 
--- .../extract_word_pair_proximity_docids.rs | 66 ++++++++++++------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 847da01c5..70865acbe 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -6,8 +6,8 @@ use std::{cmp, io}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, - GrenadParameters, MergeFn, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader, + try_split_array_at, writer_into_reader, GrenadParameters, MergeFn, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; @@ -27,14 +27,19 @@ pub fn extract_word_pair_proximity_docids( let max_memory = indexer.max_memory_by_thread(); - let mut word_pair_proximity_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|m| m / 2), - ); + let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) + .into_iter() + .map(|_| { + create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / MAX_DISTANCE as usize), + ) + }) + .collect(); let mut word_positions: VecDeque<(String, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); @@ -49,6 +54,7 @@ pub fn extract_word_pair_proximity_docids( // if we change document, we fill the sorter if current_document_id.map_or(false, |id| id != document_id) { + puffin::profile_scope!("Document into sorter"); while !word_positions.is_empty() { 
word_positions_into_word_pair_proximity( &mut word_positions, @@ -59,7 +65,7 @@ pub fn extract_word_pair_proximity_docids( document_word_positions_into_sorter( current_document_id.unwrap(), &word_pair_proximity, - &mut word_pair_proximity_docids_sorter, + &mut word_pair_proximity_docids_sorters, )?; word_pair_proximity.clear(); word_positions.clear(); @@ -85,6 +91,7 @@ pub fn extract_word_pair_proximity_docids( } if let Some(document_id) = current_document_id { + puffin::profile_scope!("Final document into sorter"); while !word_positions.is_empty() { word_positions_into_word_pair_proximity(&mut word_positions, &mut word_pair_proximity)?; } @@ -92,11 +99,23 @@ pub fn extract_word_pair_proximity_docids( document_word_positions_into_sorter( document_id, &word_pair_proximity, - &mut word_pair_proximity_docids_sorter, + &mut word_pair_proximity_docids_sorters, )?; } + { + puffin::profile_scope!("sorter_into_reader"); + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); - sorter_into_reader(word_pair_proximity_docids_sorter, indexer) + for sorter in word_pair_proximity_docids_sorters { + sorter.write_into_stream_writer(&mut writer)?; + } + + writer_into_reader(writer) + } } /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. 
@@ -106,9 +125,8 @@ pub fn extract_word_pair_proximity_docids( fn document_word_positions_into_sorter( document_id: DocumentId, word_pair_proximity: &HashMap<(String, String), u8>, - word_pair_proximity_docids_sorter: &mut grenad::Sorter, + word_pair_proximity_docids_sorters: &mut Vec>, ) -> Result<()> { - puffin::profile_function!(); let mut key_buffer = Vec::new(); for ((w1, w2), prox) in word_pair_proximity { key_buffer.clear(); @@ -117,7 +135,8 @@ fn document_word_positions_into_sorter( key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); - word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + word_pair_proximity_docids_sorters[*prox as usize - 1] + .insert(&key_buffer, document_id.to_ne_bytes())?; } Ok(()) @@ -127,16 +146,17 @@ fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(String, u16)>, word_pair_proximity: &mut HashMap<(String, String), u8>, ) -> Result<()> { - puffin::profile_function!(); let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = index_proximity(head_position as u32, *position as u32) as u8; - word_pair_proximity - .entry((head_word.clone(), word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); + if prox > 0 && prox < MAX_DISTANCE as u8 { + word_pair_proximity + .entry((head_word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } } Ok(()) } From 1c5705c164bfe0771a11d611233be1e8790f0b6e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 10 Oct 2023 11:23:16 +0200 Subject: [PATCH 012/127] clean PR warnings --- milli/src/search/new/db_cache.rs | 4 +- milli/src/update/delete_documents.rs | 4 +- .../extract/extract_docid_word_positions.rs | 4 +- .../extract/extract_word_docids.rs | 8 +-- .../extract/extract_word_fid_docids.rs | 53 ------------------- .../src/update/index_documents/extract/mod.rs | 15 +----- 
.../helpers/merge_functions.rs | 1 + .../src/update/index_documents/helpers/mod.rs | 1 + milli/src/update/index_documents/mod.rs | 7 +-- .../src/update/index_documents/typed_chunk.rs | 15 ------ milli/src/update/word_prefix_docids.rs | 2 +- 11 files changed, 11 insertions(+), 103 deletions(-) delete mode 100644 milli/src/update/index_documents/extract/extract_word_fid_docids.rs diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 3f4751185..3376cebb2 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -11,9 +11,7 @@ use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; -use crate::{ - CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, -}; +use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext}; /// A cache storing pointers to values in the LMDB databases. 
/// diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index c3b2cf1a3..1fef922cd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -16,9 +16,7 @@ use crate::facet::FacetType; use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::Hnsw; -use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32, -}; +use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index a45d488e4..0c7c5cf46 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -12,9 +12,7 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; use crate::update::index_documents::MergeFn; -use crate::{ - absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, -}; +use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index d9fb72cc2..3df962585 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -5,11 +5,10 @@ use std::iter::FromIterator; use heed::BytesDecode; use obkv::KvReaderU16; -use roaring::RoaringBitmap; use super::helpers::{ - 
create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap, - sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader, + try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; @@ -47,7 +46,6 @@ pub fn extract_word_docids( max_memory.map(|x| x / 3), ); let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); let mut words = BTreeSet::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -66,7 +64,6 @@ pub fn extract_word_docids( document_id, fid, &mut key_buffer, - &mut value_buffer, &mut words, &mut word_fid_docids_sorter, )?; @@ -124,7 +121,6 @@ fn words_into_sorter( document_id: DocumentId, fid: FieldId, key_buffer: &mut Vec, - value_buffer: &mut Vec, words: &mut BTreeSet>, word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs deleted file mode 100644 index dd4d42431..000000000 --- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::fs::File; -use std::io::{self, BufReader}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::error::SerializationError; -use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{relative_from_absolute_position, DocumentId, Result}; - -/// Extracts the word, field id, and the documents ids where this word appear at this field id. 
-#[logging_timer::time] -pub fn extract_word_fid_docids( - docid_word_positions: grenad::Reader, - indexer: GrenadParameters, -) -> Result>> { - puffin::profile_function!(); - - todo!("remove me"); - - let max_memory = indexer.max_memory_by_thread(); - - let mut word_fid_docids_sorter = create_sorter( - grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut key_buffer = Vec::new(); - let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; - let document_id = DocumentId::from_be_bytes(document_id_bytes); - - for position in read_u32_ne_bytes(value) { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - let (fid, _) = relative_from_absolute_position(position); - key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; - } - } - - let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; - - Ok(word_fid_docids_reader) -} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 32ec6fe5c..164f95452 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -6,7 +6,6 @@ mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_vector_points; mod extract_word_docids; -mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_vector_points::extract_vector_points; use 
self::extract_word_docids::extract_word_docids; -use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, - GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, + MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -206,15 +204,6 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::WordPositionDocids, "word-position-docids", ); - // spawn_extraction_task::<_, _, Vec>>>( - // docid_word_positions_chunks, - // indexer, - // lmdb_writer_sx.clone(), - // extract_word_fid_docids, - // merge_cbo_roaring_bitmaps, - // TypedChunk::WordFidDocids, - // "word-fid-docids", - // ); spawn_extraction_task::<_, _, Vec>>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 5d111067a..90cfa0f60 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -11,6 +11,7 @@ use crate::Result; pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; +#[allow(unused)] pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { if values.len() == 1 { Ok(values[0].clone()) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index d59a3bc08..3dc9f8172 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -44,6 +44,7 @@ where Some((head, tail)) } +#[allow(unused)] pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { 
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 22e42937f..e4385de70 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -38,7 +38,7 @@ use crate::update::{ self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -434,11 +434,6 @@ where word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } - TypedChunk::WordFidDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordFidDocids(chunk) - } otherwise => otherwise, }; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index cf3194255..a94bcf581 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,7 +35,6 @@ pub(crate) enum TypedChunk { word_fid_docids_reader: grenad::Reader>, }, WordPositionDocids(grenad::Reader>), - WordFidDocids(grenad::Reader>), WordPairProximityDocids(grenad::Reader>), FieldIdFacetStringDocids(grenad::Reader>), FieldIdFacetNumberDocids(grenad::Reader>), @@ -78,9 +77,6 @@ impl TypedChunk { TypedChunk::WordPositionDocids(grenad) => { format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::WordFidDocids(grenad) => { - format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len()) - } TypedChunk::WordPairProximityDocids(grenad) => { format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) } @@ -202,17 +198,6 @@ pub(crate) fn 
write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::WordFidDocids(word_fid_docids_iter) => { - append_entries_into_database( - word_fid_docids_iter, - &index.word_fid_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; - is_merged_database = true; - } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 980bab01a..8220aa777 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -8,7 +8,7 @@ use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Result}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, From f5ef69293bcf1ab643fc7c40d8543ddd4596a225 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 10 Oct 2023 16:17:03 +0200 Subject: [PATCH 013/127] deactivate prefix dbs --- milli/src/update/index_documents/mod.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e4385de70..703d7ee29 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -475,13 +475,14 @@ where let all_documents_ids = index_documents_ids | new_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - self.execute_prefix_databases( - word_docids, - exact_word_docids, - word_pair_proximity_docids, - word_position_docids, - word_fid_docids, - )?; + // TODO: reactivate prefix DB with diff-indexing + // self.execute_prefix_databases( + // word_docids, + 
// exact_word_docids, + // word_pair_proximity_docids, + // word_position_docids, + // word_fid_docids, + // )?; Ok(all_documents_ids.len()) } From 1dd97578a821a6dcf6ffd4eac752fcab36c2c44b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 12 Oct 2023 11:46:56 +0200 Subject: [PATCH 014/127] Make the transform struct return diff-based documents obkvs --- milli/src/update/del_add.rs | 60 +++++ .../helpers/merge_functions.rs | 126 ++++++--- .../src/update/index_documents/helpers/mod.rs | 4 +- milli/src/update/index_documents/transform.rs | 253 +++++++++++++----- milli/src/update/mod.rs | 1 + 5 files changed, 349 insertions(+), 95 deletions(-) create mode 100644 milli/src/update/del_add.rs diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs new file mode 100644 index 000000000..e8e595837 --- /dev/null +++ b/milli/src/update/del_add.rs @@ -0,0 +1,60 @@ +use obkv::Key; + +pub type KvWriterDelAdd = obkv::KvWriter; +pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>; + +/// DelAdd defines the new value to add in the database and old value to delete from the database. +/// +/// Its used in an OBKV to be serialized in grenad files. +#[repr(u8)] +#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)] +pub enum DelAdd { + Deletion = 0, + Addition = 1, +} + +impl Key for DelAdd { + const BYTES_SIZE: usize = std::mem::size_of::(); + type BYTES = [u8; Self::BYTES_SIZE]; + + fn to_be_bytes(&self) -> Self::BYTES { + u8::to_be_bytes(*self as u8) + } + + fn from_be_bytes(array: Self::BYTES) -> Self { + match u8::from_be_bytes(array) { + 0 => Self::Deletion, + 1 => Self::Addition, + otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise), + } + } +} + +/// Creates a Kv> from Kv +/// +/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key. +/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key. 
+/// if both deletion and addition are `true, the value will be inserted in both keys. +pub fn into_del_add_obkv( + reader: obkv::KvReader, + deletion: bool, + addition: bool, + buffer: &mut Vec, +) -> Result<(), std::io::Error> { + let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); + for (key, value) in reader.iter() { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + if deletion { + value_writer.insert(DelAdd::Deletion, value)?; + } + if addition { + value_writer.insert(DelAdd::Addition, value)?; + } + value_writer.finish()?; + writer.insert(key, &value_buffer)?; + } + + writer.finish() +} diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 90cfa0f60..6317b5610 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -6,6 +6,7 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::transform::Operation; use crate::Result; @@ -76,55 +77,118 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result) { +pub fn merge_two_del_add_obkvs( + base: obkv::KvReaderU16, + update: obkv::KvReaderU16, + merge_additions: bool, + buffer: &mut Vec, +) { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; buffer.clear(); let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { match eob { - Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), + Left((k, v)) => { + if merge_additions { + writer.insert(k, v).unwrap() + } else { + // If merge_additions is false, recreate an obkv keeping the 
deletions only. + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let base_reader = KvReaderDelAdd::new(v); + + if let Some(deletion) = base_reader.get(DelAdd::Deletion) { + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + value_writer.finish().unwrap(); + writer.insert(k, &value_buffer).unwrap() + } + } + } + Right((k, v)) => writer.insert(k, v).unwrap(), + Both((k, base), (_, update)) => { + // merge deletions and additions. + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let base_reader = KvReaderDelAdd::new(base); + let update_reader = KvReaderDelAdd::new(update); + + // keep newest deletion. + if let Some(deletion) = + update_reader.get(DelAdd::Deletion).or(base_reader.get(DelAdd::Deletion)) + { + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + } + + // keep base addition only if merge_additions is true. + let base_addition = + merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten(); + // keep newest addition. + if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) { + value_writer.insert(DelAdd::Addition, addition).unwrap(); + } + + value_writer.finish().unwrap(); + writer.insert(k, &value_buffer).unwrap() + } } } writer.finish().unwrap(); } -/// Merge all the obks in the order we see them. -pub fn merge_obkvs_and_operations<'a>( +/// Merge all the obkvs from the newest to the oldest. +fn inner_merge_del_add_obkvs<'a>( + obkvs: &[Cow<'a, [u8]>], + merge_additions: bool, +) -> Result> { + // pop the newest operation from the list. + let (newest, obkvs) = obkvs.split_last().unwrap(); + // keep the operation type for the returned value. + let newest_operation_type = newest[0]; + + // treat the newest obkv as the starting point of the merge. + let mut acc_operation_type = newest_operation_type; + let mut acc = newest[1..].to_vec(); + let mut buffer = Vec::new(); + // reverse iter from the most recent to the oldest. 
+ for current in obkvs.into_iter().rev() { + // if in the previous iteration there was a complete deletion, + // stop the merge process. + if acc_operation_type == Operation::Deletion as u8 { + break; + } + + let newest = obkv::KvReader::new(&acc); + let oldest = obkv::KvReader::new(¤t[1..]); + merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer); + + // we want the result of the merge into our accumulator. + std::mem::swap(&mut acc, &mut buffer); + acc_operation_type = current[0]; + } + + acc.insert(0, newest_operation_type); + Ok(Cow::from(acc)) +} + +/// Merge all the obkvs from the newest to the oldest. +pub fn obkvs_merge_additions_and_deletions<'a>( _key: &[u8], obkvs: &[Cow<'a, [u8]>], ) -> Result> { - // [add, add, delete, add, add] - // we can ignore everything that happened before the last delete. - let starting_position = - obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0); + inner_merge_del_add_obkvs(obkvs, true) +} - // [add, add, delete] - // if the last operation was a deletion then we simply return the deletion - if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8 - { - return Ok(obkvs[obkvs.len() - 1].clone()); - } - let mut buffer = Vec::new(); - - // (add, add, delete) [add, add] - // in the other case, no deletion will be encountered during the merge - let mut ret = - obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(¤t[1..]); - merge_two_obkvs(first, second, &mut buffer); - - // we want the result of the merge into our accumulator - std::mem::swap(&mut acc, &mut buffer); - acc - }); - - ret.insert(0, Operation::Addition as u8); - Ok(Cow::from(ret)) +/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions. 
+pub fn obkvs_keep_last_addition_merge_deletions<'a>( + _key: &[u8], + obkvs: &[Cow<'a, [u8]>], +) -> Result> { + inner_merge_del_add_obkvs(obkvs, false) } pub fn merge_cbo_roaring_bitmaps<'a>( diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 3dc9f8172..8f70a2de2 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -14,8 +14,8 @@ pub use grenad_helpers::{ }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, - merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, - serialize_roaring_bitmap, MergeFn, + merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, }; use crate::MAX_WORD_LENGTH; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f0e3bbbf0..a45a6ee3c 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -7,18 +7,20 @@ use std::io::{Read, Seek}; use fxhash::FxHashMap; use heed::RoTxn; use itertools::Itertools; -use obkv::{KvReader, KvWriter}; +use obkv::{KvReader, KvReaderU16, KvWriter}; use roaring::RoaringBitmap; use serde_json::Value; use smartstring::SmartString; use super::helpers::{ - create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn, + create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, MergeFn, }; use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; +use crate::update::del_add::into_del_add_obkv; use 
crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -106,8 +108,8 @@ impl<'a, 'i> Transform<'a, 'i> { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. let merge_function = match index_documents_method { - IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, - IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations, + IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions, + IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions, }; // We initialize the sorter with the user indexing settings. @@ -223,19 +225,21 @@ impl<'a, 'i> Transform<'a, 'i> { let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { Entry::Occupied(entry) => *entry.get() as u32, Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a replaced document. - // It'll be deleted later. - if let Some(docid) = external_documents_ids.get(entry.key()) { - // If it was already in the list of replaced documents it means it was deleted - // by the remove_document method. We should starts as if it never existed. - if self.replaced_documents_ids.insert(docid) { - original_docid = Some(docid); + let docid = match external_documents_ids.get(entry.key()) { + Some(docid) => { + // If it was already in the list of replaced documents it means it was deleted + // by the remove_document method. We should starts as if it never existed. 
+ if self.replaced_documents_ids.insert(docid) { + original_docid = Some(docid); + } + + docid } - } - let docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; + None => self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?, + }; entry.insert(docid as u64); docid } @@ -263,16 +267,28 @@ impl<'a, 'i> Transform<'a, 'i> { skip_insertion = true; } else { // we associate the base document with the new key, everything will get merged later. + let keep_original_version = + self.index_documents_method == IndexDocumentsMethod::UpdateDocuments; document_sorter_buffer.clear(); document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(base_obkv); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + true, + keep_original_version, + &mut document_sorter_buffer, + )?; self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { Some(flattened_obkv) => { // we recreate our buffer with the flattened documents document_sorter_buffer.clear(); document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&flattened_obkv); + into_del_add_obkv( + KvReaderU16::new(&flattened_obkv), + true, + keep_original_version, + &mut document_sorter_buffer, + )?; self.flattened_sorter .insert(docid.to_be_bytes(), &document_sorter_buffer)? } @@ -288,7 +304,12 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_buffer.clear(); document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&obkv_buffer); + into_del_add_obkv( + KvReaderU16::new(&obkv_buffer), + false, + true, + &mut document_sorter_buffer, + )?; // We use the extracted/generated user id as the key for this document. 
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; @@ -296,7 +317,12 @@ impl<'a, 'i> Transform<'a, 'i> { Some(flattened_obkv) => { document_sorter_buffer.clear(); document_sorter_buffer.push(Operation::Addition as u8); - document_sorter_buffer.extend_from_slice(&flattened_obkv); + into_del_add_obkv( + KvReaderU16::new(&flattened_obkv), + false, + true, + &mut document_sorter_buffer, + )?; self.flattened_sorter .insert(docid.to_be_bytes(), &document_sorter_buffer)? } @@ -354,19 +380,25 @@ impl<'a, 'i> Transform<'a, 'i> { let external_documents_ids = self.index.external_documents_ids(wtxn)?; let mut documents_deleted = 0; + let mut document_sorter_buffer = Vec::new(); for to_remove in to_remove { if should_abort() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } - match self.new_external_documents_ids_builder.entry((*to_remove).into()) { + // Check if the document has been added in the current indexing process. + let deleted_from_current = match self + .new_external_documents_ids_builder + .entry((*to_remove).into()) + { // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. Entry::Occupied(entry) => { let doc_id = *entry.get() as u32; - self.original_sorter - .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; - self.flattened_sorter - .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; + document_sorter_buffer.clear(); + document_sorter_buffer.push(Operation::Deletion as u8); + obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap(); + self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?; + self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?; // we must NOT update the list of replaced_documents_ids // Either: @@ -375,21 +407,69 @@ impl<'a, 'i> Transform<'a, 'i> { // we're removing it there is nothing to do. 
self.new_documents_ids.remove(doc_id); entry.remove_entry(); + true } - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a `to_delete` document. - // It'll be deleted later. We don't need to push anything to the sorters. - if let Some(docid) = external_documents_ids.get(entry.key()) { - self.replaced_documents_ids.insert(docid); - } else { - // if the document is nowehere to be found, there is nothing to do and we must NOT - // increment the count of documents_deleted - continue; - } - } + Entry::Vacant(_) => false, }; - documents_deleted += 1; + // If the document was already in the db we mark it as a `to_delete` document. + // Then we push the document in sorters in deletion mode. + let deleted_from_db = match external_documents_ids.get(&to_remove) { + Some(docid) => { + self.replaced_documents_ids.insert(docid); + + // fetch the obkv document + let original_key = BEU32::new(docid); + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(wtxn, &original_key)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + // push it as to delete in the original_sorter + document_sorter_buffer.clear(); + document_sorter_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + true, + false, + &mut document_sorter_buffer, + )?; + self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + + // flatten it and push it as to delete in the flattened_sorter + match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { + Some(flattened_obkv) => { + // we recreate our buffer with the flattened documents + document_sorter_buffer.clear(); + document_sorter_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&flattened_obkv), + true, + false, + &mut document_sorter_buffer, + )?; + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_buffer)? 
+ } + None => self + .flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + } + + true + } + None => false, + }; + + // increase counter only if the document existed somewhere before. + if deleted_from_current || deleted_from_db { + documents_deleted += 1; + } } Ok(documents_deleted) @@ -589,9 +669,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; while let Some((key, val)) = iter.next()? { - if val[0] == Operation::Deletion as u8 { - continue; - } + // skip first byte corresponding to the operation type (Deletion or Addition). let val = &val[1..]; // send a callback to show at which step we are @@ -631,9 +709,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We get rids of the `Operation` byte and skip the deleted documents as well. let mut iter = self.flattened_sorter.into_stream_merger_iter()?; while let Some((key, val)) = iter.next()? { - if val[0] == Operation::Deletion as u8 { - continue; - } + // skip first byte corresponding to the operation type (Deletion or Addition). let val = &val[1..]; writer.insert(key, val)?; } @@ -713,6 +789,7 @@ impl<'a, 'i> Transform<'a, 'i> { ); let mut obkv_buffer = Vec::new(); + let mut document_sorter_buffer = Vec::new(); for result in self.index.all_documents(wtxn)? { let (docid, obkv) = result?; @@ -727,7 +804,9 @@ impl<'a, 'i> Transform<'a, 'i> { } let buffer = obkv_writer.into_inner()?; - original_writer.insert(docid.to_be_bytes(), &buffer)?; + document_sorter_buffer.clear(); + into_del_add_obkv(KvReaderU16::new(buffer), true, true, &mut document_sorter_buffer)?; + original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?; // Once we have the document. We're going to flatten it // and insert it in the flattened sorter. 
@@ -762,7 +841,9 @@ impl<'a, 'i> Transform<'a, 'i> { let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; writer.insert(fid, &value)?; } - flattened_writer.insert(docid.to_be_bytes(), &buffer)?; + document_sorter_buffer.clear(); + into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut document_sorter_buffer)?; + flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?; } // Once we have written all the documents, we extract @@ -828,38 +909,86 @@ mod test { #[test] fn merge_obkvs() { - let mut doc_0 = Vec::new(); - let mut kv_writer = KvWriter::new(&mut doc_0); + let mut additive_doc_0 = Vec::new(); + let mut deletive_doc_0 = Vec::new(); + let mut del_add_doc_0 = Vec::new(); + let mut kv_writer = KvWriter::memory(); kv_writer.insert(0_u8, [0]).unwrap(); - kv_writer.finish().unwrap(); - doc_0.insert(0, Operation::Addition as u8); + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap(); + additive_doc_0.insert(0, Operation::Addition as u8); + into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap(); + deletive_doc_0.insert(0, Operation::Deletion as u8); + into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap(); + del_add_doc_0.insert(0, Operation::Addition as u8); - let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap(); - assert_eq!(*ret, doc_0); + let mut additive_doc_1 = Vec::new(); + let mut kv_writer = KvWriter::memory(); + kv_writer.insert(1_u8, [1]).unwrap(); + let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap(); + additive_doc_1.insert(0, Operation::Addition as u8); - let ret = merge_obkvs_and_operations( + let mut additive_doc_0_1 = Vec::new(); + let mut kv_writer = KvWriter::memory(); + kv_writer.insert(0_u8, [0]).unwrap(); + kv_writer.insert(1_u8, [1]).unwrap(); + 
let buffer = kv_writer.into_inner().unwrap(); + into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap(); + additive_doc_0_1.insert(0, Operation::Addition as u8); + + let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())]) + .unwrap(); + assert_eq!(*ret, additive_doc_0); + + let ret = obkvs_merge_additions_and_deletions( &[], - &[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())], + &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())], ) .unwrap(); - assert_eq!(*ret, doc_0); + assert_eq!(*ret, del_add_doc_0); - let ret = merge_obkvs_and_operations( + let ret = obkvs_merge_additions_and_deletions( &[], - &[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())], + &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())], ) .unwrap(); - assert_eq!(*ret, [Operation::Deletion as u8]); + assert_eq!(*ret, deletive_doc_0); - let ret = merge_obkvs_and_operations( + let ret = obkvs_merge_additions_and_deletions( &[], &[ - Cow::from([Operation::Addition as u8, 1].as_slice()), - Cow::from([Operation::Deletion as u8].as_slice()), - Cow::from(doc_0.as_slice()), + Cow::from(additive_doc_1.as_slice()), + Cow::from(deletive_doc_0.as_slice()), + Cow::from(additive_doc_0.as_slice()), ], ) .unwrap(); - assert_eq!(*ret, doc_0); + assert_eq!(*ret, del_add_doc_0); + + let ret = obkvs_merge_additions_and_deletions( + &[], + &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, additive_doc_0_1); + + let ret = obkvs_keep_last_addition_merge_deletions( + &[], + &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], + ) + .unwrap(); + assert_eq!(*ret, additive_doc_0); + + let ret = obkvs_keep_last_addition_merge_deletions( + &[], + &[ + Cow::from(deletive_doc_0.as_slice()), + Cow::from(additive_doc_1.as_slice()), + 
Cow::from(additive_doc_0.as_slice()), + ], + ) + .unwrap(); + assert_eq!(*ret, del_add_doc_0); } } diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 9982957e5..6224995a3 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -21,6 +21,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; +pub(crate) mod del_add; mod delete_documents; pub(crate) mod facet; mod index_documents; From 313b16bec28835ef1e921fb967ec881e335f5192 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 16 Oct 2023 14:58:11 +0200 Subject: [PATCH 015/127] Support diff indexing on extract_docid_word_positions --- milli/src/update/del_add.rs | 40 ++ .../extract/extract_docid_word_positions.rs | 369 ++++++++++++------ .../helpers/merge_functions.rs | 6 +- milli/src/update/index_documents/transform.rs | 4 +- .../src/update/index_documents/typed_chunk.rs | 34 +- 5 files changed, 322 insertions(+), 131 deletions(-) diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index e8e595837..346ae0afa 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -58,3 +58,43 @@ pub fn into_del_add_obkv( writer.finish() } + +/// Creates a Kv> from two Kv +/// +/// putting each deletion obkv's keys under an DelAdd::Deletion +/// and putting each addition obkv's keys under an DelAdd::Addition +pub fn del_add_from_two_obkvs( + deletion: obkv::KvReader, + addition: obkv::KvReader, + buffer: &mut Vec, +) -> Result<(), std::io::Error> { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut writer = obkv::KvWriter::new(buffer); + let mut value_buffer = Vec::new(); + + for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) { + value_buffer.clear(); + match eob { + Left((k, v)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, v).unwrap(); + writer.insert(k, 
value_writer.into_inner()?).unwrap(); + } + Right((k, v)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Addition, v).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + Both((k, deletion), (_, addition)) => { + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, deletion).unwrap(); + value_writer.insert(DelAdd::Addition, addition).unwrap(); + writer.insert(k, value_writer.into_inner()?).unwrap(); + } + } + } + + writer.finish() +} diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 0c7c5cf46..e02e492d2 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -11,7 +11,7 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::update::index_documents::MergeFn; +use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; @@ -30,15 +30,21 @@ pub fn extract_docid_word_positions( allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader>, ScriptLanguageDocidsMap)> { +) -> Result<( + RoaringBitmap, + grenad::Reader>, + (ScriptLanguageDocidsMap, ScriptLanguageDocidsMap), +)> { puffin::profile_function!(); let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); + // initialize destination values. 
let mut documents_ids = RoaringBitmap::new(); - let mut script_language_docids = HashMap::new(); + let mut del_script_language_docids = HashMap::new(); + let mut add_script_language_docids = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, keep_latest_obkv, @@ -48,7 +54,142 @@ pub fn extract_docid_word_positions( max_memory, ); - let mut buffers = Buffers::default(); + // initialize buffers. + let mut del_buffers = Buffers::default(); + let mut add_buffers = Buffers::default(); + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); + + // initialize tokenizer. + let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None); + let tokenizer = builder.build(); + + // iterate over documents. + let mut cursor = obkv_documents.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let document_id = key + .try_into() + .map(u32::from_be_bytes) + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let obkv = KvReader::::new(value); + + // if the searchable fields didn't change, skip the searchable indexing for this document. + if !searchable_fields_changed(&KvReader::::new(value), searchable_fields) { + continue; + } + + documents_ids.push(document_id); + + // Update key buffer prefix. + key_buffer.clear(); + key_buffer.extend_from_slice(&document_id.to_be_bytes()); + + // Tokenize deletions and additions in 2 diffferent threads. 
+ let (del, add): (Result<_>, Result<_>) = rayon::join( + || { + // deletions + lang_safe_tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + stop_words, + allowed_separators, + dictionary, + max_positions_per_attributes, + DelAdd::Deletion, + &mut del_buffers, + ) + }, + || { + // additions + lang_safe_tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + stop_words, + allowed_separators, + dictionary, + max_positions_per_attributes, + DelAdd::Addition, + &mut add_buffers, + ) + }, + ); + + let (del_obkv, del_script_language_word_count) = del?; + let (add_obkv, add_script_language_word_count) = add?; + + // merge deletions and additions. + value_buffer.clear(); + del_add_from_two_obkvs( + KvReader::::new(del_obkv), + KvReader::::new(add_obkv), + &mut value_buffer, + )?; + + // write them into the sorter. + let obkv = KvReader::::new(value); + for (field_id, value) in obkv.iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&field_id.to_be_bytes()); + docid_word_positions_sorter.insert(&key_buffer, value)?; + } + + // update script_language_docids deletions. + for (script, languages_frequency) in del_script_language_word_count { + for (language, _) in languages_frequency { + let entry = del_script_language_docids + .entry((script, language)) + .or_insert_with(RoaringBitmap::new); + entry.push(document_id); + } + } + + // update script_language_docids additions. + for (script, languages_frequency) in add_script_language_word_count { + for (language, _) in languages_frequency { + let entry = add_script_language_docids + .entry((script, language)) + .or_insert_with(RoaringBitmap::new); + entry.push(document_id); + } + } + } + + let script_language_docids = (del_script_language_docids, add_script_language_docids); + sorter_into_reader(docid_word_positions_sorter, indexer) + .map(|reader| (documents_ids, reader, script_language_docids)) +} + +/// Check if any searchable fields of a document changed. 
+fn searchable_fields_changed( + obkv: &KvReader, + searchable_fields: &Option>, +) -> bool { + for (field_id, field_bytes) in obkv.iter() { + if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + let del_add = KvReaderDelAdd::new(field_bytes); + match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { + // if both fields are None, check the next field. + (None, None) => (), + // if both contains a value and values are the same, check the next field. + (Some(del), Some(add)) if del == add => (), + // otherwise the fields are different, return true. + _otherwise => return true, + } + } + } + + false +} + +/// Factorize tokenizer building. +fn tokenizer_builder<'a>( + stop_words: Option<&'a fst::Set<&[u8]>>, + allowed_separators: Option<&'a [&str]>, + dictionary: Option<&'a [&str]>, + script_language: Option<&'a HashMap>>, +) -> TokenizerBuilder<'a, &'a [u8]> { let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -59,138 +200,144 @@ pub fn extract_docid_word_positions( if let Some(separators) = allowed_separators { tokenizer_builder.separators(separators); } - let tokenizer = tokenizer_builder.build(); - let mut cursor = obkv_documents.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let document_id = key - .try_into() - .map(u32::from_be_bytes) - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let obkv = KvReader::::new(value); + if let Some(script_language) = script_language { + tokenizer_builder.allow_list(&script_language); + } - documents_ids.push(document_id); - buffers.key_buffer.clear(); - buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes()); + tokenizer_builder +} - let mut script_language_word_count = HashMap::new(); +/// Extract words maped with their positions of a document, +/// ensuring no Language detection mistakes was made. 
+fn lang_safe_tokens_from_document<'a>( + obkv: &KvReader, + searchable_fields: &Option>, + tokenizer: &Tokenizer, + stop_words: Option<&fst::Set<&[u8]>>, + allowed_separators: Option<&[&str]>, + dictionary: Option<&[&str]>, + max_positions_per_attributes: u32, + del_add: DelAdd, + buffers: &'a mut Buffers, +) -> Result<(&'a [u8], HashMap>)> { + let mut script_language_word_count = HashMap::new(); - extract_tokens_from_document( - &obkv, - searchable_fields, - &tokenizer, - max_positions_per_attributes, - &mut buffers, - &mut script_language_word_count, - &mut docid_word_positions_sorter, - )?; + tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + max_positions_per_attributes, + del_add, + buffers, + &mut script_language_word_count, + )?; - // if we detect a potetial mistake in the language detection, - // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. - // context: https://github.com/meilisearch/meilisearch/issues/3565 - if script_language_word_count - .values() - .map(Vec::as_slice) - .any(potential_language_detection_error) - { - // build an allow list with the most frequent detected languages in the document. - let script_language: HashMap<_, _> = - script_language_word_count.iter().filter_map(most_frequent_languages).collect(); + // if we detect a potetial mistake in the language detection, + // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. + // context: https://github.com/meilisearch/meilisearch/issues/3565 + if script_language_word_count + .values() + .map(Vec::as_slice) + .any(potential_language_detection_error) + { + // build an allow list with the most frequent detected languages in the document. + let script_language: HashMap<_, _> = + script_language_word_count.iter().filter_map(most_frequent_languages).collect(); - // if the allow list is empty, meaning that no Language is considered frequent, - // then we don't rerun the extraction. 
- if !script_language.is_empty() { - // build a new temporary tokenizer including the allow list. - let mut tokenizer_builder = TokenizerBuilder::new(); - if let Some(stop_words) = stop_words { - tokenizer_builder.stop_words(stop_words); - } - if let Some(dictionary) = dictionary { - tokenizer_builder.words_dict(dictionary); - } - if let Some(separators) = allowed_separators { - tokenizer_builder.separators(separators); - } - tokenizer_builder.allow_list(&script_language); - let tokenizer = tokenizer_builder.build(); + // if the allow list is empty, meaning that no Language is considered frequent, + // then we don't rerun the extraction. + if !script_language.is_empty() { + // build a new temporary tokenizer including the allow list. + let mut builder = tokenizer_builder( + stop_words, + dictionary, + allowed_separators, + Some(&script_language), + ); + let tokenizer = builder.build(); - script_language_word_count.clear(); + script_language_word_count.clear(); - // rerun the extraction. - extract_tokens_from_document( - &obkv, - searchable_fields, - &tokenizer, - max_positions_per_attributes, - &mut buffers, - &mut script_language_word_count, - &mut docid_word_positions_sorter, - )?; - } - } - - for (script, languages_frequency) in script_language_word_count { - for (language, _) in languages_frequency { - let entry = script_language_docids - .entry((script, language)) - .or_insert_with(RoaringBitmap::new); - entry.push(document_id); - } + // rerun the extraction. + tokens_from_document( + &obkv, + searchable_fields, + &tokenizer, + max_positions_per_attributes, + del_add, + buffers, + &mut script_language_word_count, + )?; } } - sorter_into_reader(docid_word_positions_sorter, indexer) - .map(|reader| (documents_ids, reader, script_language_docids)) + Ok((&buffers.obkv_buffer, script_language_word_count)) } -fn extract_tokens_from_document( +/// Extract words maped with their positions of a document. 
+fn tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, tokenizer: &Tokenizer, max_positions_per_attributes: u32, - buffers: &mut Buffers, + del_add: DelAdd, + buffers: &'a mut Buffers, script_language_word_count: &mut HashMap>, - docid_word_positions_sorter: &mut grenad::Sorter, -) -> Result<()> { +) -> Result<&'a [u8]> { + buffers.obkv_buffer.clear(); + let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + // if field is searchable. if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { - let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - buffers.field_buffer.clear(); - if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { - let tokens = process_tokens(tokenizer.tokenize(field)) - .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + // extract deletion or addition only. + if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { + // parse json. + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - buffers.obkv_buffer.clear(); - let mut writer = KvWriterU16::new(&mut buffers.obkv_buffer); - for (index, token) in tokens { - // if a language has been detected for the token, we update the counter. - if let Some(language) = token.language { - let script = token.script; - let entry = - script_language_word_count.entry(script).or_insert_with(Vec::new); - match entry.iter_mut().find(|(l, _)| *l == language) { - Some((_, n)) => *n += 1, - None => entry.push((language, 1)), + // prepare writting destination. + buffers.obkv_positions_buffer.clear(); + let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer); + + // convert json into an unique string. + buffers.field_buffer.clear(); + if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { + // create an iterator of token with their positions. 
+ let tokens = process_tokens(tokenizer.tokenize(field)) + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + + for (index, token) in tokens { + // if a language has been detected for the token, we update the counter. + if let Some(language) = token.language { + let script = token.script; + let entry = + script_language_word_count.entry(script).or_insert_with(Vec::new); + match entry.iter_mut().find(|(l, _)| *l == language) { + Some((_, n)) => *n += 1, + None => entry.push((language, 1)), + } + } + + // keep a word only if it is not empty and fit in a LMDB key. + let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + let position: u16 = index + .try_into() + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + writer.insert(position, token.as_bytes())?; } } - let token = token.lemma().trim(); - if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - let position: u16 = index - .try_into() - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - writer.insert(position, token.as_bytes())?; - } - } - let positions = writer.into_inner()?; - buffers.key_buffer.truncate(mem::size_of::()); - buffers.key_buffer.extend_from_slice(&field_id.to_be_bytes()); - docid_word_positions_sorter.insert(&buffers.key_buffer, positions)?; + // write positions into document. + let positions = writer.into_inner()?; + document_writer.insert(field_id, positions)?; + } } } } - Ok(()) + Ok(document_writer.into_inner().map(|v| v.as_slice())?) } /// Transform a JSON value into a string that can be indexed. @@ -293,12 +440,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize) #[derive(Default)] struct Buffers { - // the key buffer is the concatenation of the internal document id with the field id. - // The buffer has to be completelly cleared between documents, - // and the field id part must be cleared between each field. 
- key_buffer: Vec, // the field buffer for each fields desserialization, and must be cleared between each field. field_buffer: String, // buffer used to store the value data containing an obkv. obkv_buffer: Vec, + // buffer used to store the value data containing an obkv of tokens with their positions. + obkv_positions_buffer: Vec, } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 6317b5610..dee200b21 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -117,8 +117,9 @@ pub fn merge_two_del_add_obkvs( let update_reader = KvReaderDelAdd::new(update); // keep newest deletion. - if let Some(deletion) = - update_reader.get(DelAdd::Deletion).or(base_reader.get(DelAdd::Deletion)) + if let Some(deletion) = update_reader + .get(DelAdd::Deletion) + .or_else(|| base_reader.get(DelAdd::Deletion)) { value_writer.insert(DelAdd::Deletion, deletion).unwrap(); } @@ -127,6 +128,7 @@ pub fn merge_two_del_add_obkvs( let base_addition = merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten(); // keep newest addition. 
+ // TODO use or_else if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) { value_writer.insert(DelAdd::Addition, addition).unwrap(); } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index a45a6ee3c..2b77768cb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -805,7 +805,7 @@ impl<'a, 'i> Transform<'a, 'i> { let buffer = obkv_writer.into_inner()?; document_sorter_buffer.clear(); - into_del_add_obkv(KvReaderU16::new(buffer), true, true, &mut document_sorter_buffer)?; + into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?; original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?; // Once we have the document. We're going to flatten it @@ -842,7 +842,7 @@ impl<'a, 'i> Transform<'a, 'i> { writer.insert(fid, &value)?; } document_sorter_buffer.clear(); - into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut document_sorter_buffer)?; + into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?; flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?; } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index a94bcf581..f2dc7d336 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -43,7 +43,9 @@ pub(crate) enum TypedChunk { FieldIdFacetIsEmptyDocids(grenad::Reader>), GeoPoints(grenad::Reader>), VectorPoints(grenad::Reader>), - ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), + ScriptLanguageDocids( + (HashMap<(Script, Language), RoaringBitmap>, HashMap<(Script, Language), RoaringBitmap>), + ), } impl TypedChunk { @@ -101,8 +103,8 @@ impl TypedChunk { TypedChunk::VectorPoints(grenad) => { format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) } - 
TypedChunk::ScriptLanguageDocids(grenad) => { - format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len()) + TypedChunk::ScriptLanguageDocids((_, addition)) => { + format!("ScriptLanguageDocids {{ number_of_entries: {} }}", addition.len()) } } } @@ -344,19 +346,21 @@ pub(crate) fn write_typed_chunk_into_index( log::debug!("There are {} entries in the HNSW so far", hnsw_length); index.put_vector_hnsw(wtxn, &new_hnsw)?; } - TypedChunk::ScriptLanguageDocids(hash_pair) => { - let mut buffer = Vec::new(); - for (key, value) in hash_pair { - buffer.clear(); - let final_value = match index.script_language_docids.get(wtxn, &key)? { - Some(db_values) => { - let mut db_value_buffer = Vec::new(); - serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; - let mut new_value_buffer = Vec::new(); - serialize_roaring_bitmap(&value, &mut new_value_buffer)?; - merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; - RoaringBitmap::deserialize_from(&buffer[..])? + TypedChunk::ScriptLanguageDocids((deletion, addition)) => { + for (key, value) in deletion { + if let Some(mut db_values) = index.script_language_docids.get(wtxn, &key)? { + db_values -= value; + if db_values.is_empty() { + index.script_language_docids.delete(wtxn, &key)?; + } else { + index.script_language_docids.put(wtxn, &key, &db_values)?; } + } + } + + for (key, value) in addition { + let final_value = match index.script_language_docids.get(wtxn, &key)? 
{ + Some(mut db_values) => db_values | value, None => value, }; index.script_language_docids.put(wtxn, &key, &final_value)?; From 0c47defeee739ec7b2528c4993d8e11bad08d34b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 17 Oct 2023 18:09:41 +0200 Subject: [PATCH 016/127] Work on fid docid facet values rewrite --- milli/src/update/facet/bulk.rs | 2 + milli/src/update/facet/mod.rs | 1 + .../extract/extract_facet_number_docids.rs | 4 + .../extract/extract_facet_string_docids.rs | 4 + .../extract/extract_fid_docid_facet_values.rs | 276 +++++++++++++++--- 5 files changed, 249 insertions(+), 38 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index a3f0c8f71..a2b1c9dcd 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -133,6 +133,8 @@ impl FacetsUpdateBulkInner { self.db.delete_range(wtxn, &range).map(drop)?; Ok(()) } + + // TODO the new_data is an Reader>> fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { let new_data = match self.new_data.take() { Some(x) => x, diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index bbd25f91e..decb6a9ac 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -115,6 +115,7 @@ pub struct FacetsUpdate<'i> { min_level_size: u8, } impl<'i> FacetsUpdate<'i> { + // TODO grenad::Reader> pub fn new( index: &'i Index, facet_type: FacetType, diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index d557e0b6c..76dc6d3c6 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -17,6 +17,7 @@ use crate::Result; /// documents ids from the given chunk of docid facet number positions. 
#[logging_timer::time] pub fn extract_facet_number_docids( + // TODO Reader> docid_fid_facet_number: grenad::Reader, indexer: GrenadParameters, ) -> Result>> { @@ -26,6 +27,7 @@ pub fn extract_facet_number_docids( let mut facet_number_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, + // TODO We must modify the merger to do unions of Del and Add separately merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -34,12 +36,14 @@ pub fn extract_facet_number_docids( ); let mut cursor = docid_fid_facet_number.into_cursor()?; + // TODO the value is a Obkv and must be taken into account while let Some((key_bytes, _)) = cursor.move_on_next()? { let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); let key = FacetGroupKey { field_id, level: 0, left_bound: number }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + // TODO We must put a Obkv facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index b1b27449e..b861c04e4 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -15,6 +15,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// documents ids from the given chunk of docid facet string positions. 
#[logging_timer::time] pub fn extract_facet_string_docids( + // TODO Reader> docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, ) -> Result>> { @@ -24,6 +25,7 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, + // TODO We must modify the merger to do unions of Del and Add separately merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -33,6 +35,7 @@ pub fn extract_facet_string_docids( let mut cursor = docid_fid_facet_string.into_cursor()?; while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { + // TODO the value is a Obkv and must be taken into account let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); @@ -54,6 +57,7 @@ pub fn extract_facet_string_docids( let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); // document id is encoded in native-endian because of the CBO roaring bitmap codec + // TODO Reader> facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 42c355323..0340fb709 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -6,17 +6,21 @@ use std::mem::size_of; use heed::zerocopy::AsBytes; use heed::BytesEncode; +use itertools::EitherOrBoth; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::{from_slice, Value}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; +use crate::update::del_add::{DelAdd, 
KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; /// The extracted facet values stored in grenad files by type. pub struct ExtractedFacetValues { + // TOOD rename into `fid_docid_*` pub docid_fid_facet_numbers_chunk: grenad::Reader>, pub docid_fid_facet_strings_chunk: grenad::Reader>, pub fid_facet_is_null_docids_chunk: grenad::Reader>, @@ -31,6 +35,7 @@ pub struct ExtractedFacetValues { /// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially. #[logging_timer::time] pub fn extract_fid_docid_facet_values( + // TODO Reader>> obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, @@ -58,13 +63,15 @@ pub fn extract_fid_docid_facet_values( max_memory.map(|m| m / 2), ); - let mut facet_exists_docids = BTreeMap::::new(); - let mut facet_is_null_docids = BTreeMap::::new(); - let mut facet_is_empty_docids = BTreeMap::::new(); + // The tuples represents the Del and Add side for a bitmap + let mut facet_exists_docids = BTreeMap::::new(); + let mut facet_is_null_docids = BTreeMap::::new(); + let mut facet_is_empty_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? 
{ + // TODO Obkv> let obkv = obkv::KvReader::new(value); for (field_id, field_bytes) in obkv.iter() { @@ -79,50 +86,233 @@ pub fn extract_fid_docid_facet_values( let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); - facet_exists_docids.entry(field_id).or_default().insert(document); - // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(docid_bytes); - let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + let del_add_obkv = obkv::KvReader::new(field_bytes); + let del_value = match del_add_obkv.get(DelAdd::Deletion) { + Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?, + None => None, + }; + let add_value = match del_add_obkv.get(DelAdd::Addition) { + Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?, + None => None, + }; - match extract_facet_values( - &value, - geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng), - ) { - FilterableValues::Null => { - facet_is_null_docids.entry(field_id).or_default().insert(document); - } - FilterableValues::Empty => { - facet_is_empty_docids.entry(field_id).or_default().insert(document); - } - FilterableValues::Values { numbers, strings } => { - // insert facet numbers in sorter - for number in numbers { - key_buffer.truncate(size_of::() + size_of::()); - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); + // We insert the document id on the Del and the Add side if the field exists. 
+ let (mut del_exists, mut add_exists) = + facet_exists_docids.entry(field_id).or_default(); + if del_value.is_some() { + del_exists.insert(document); + } + if add_value.is_some() { + add_exists.insert(document); + } - fid_docid_facet_numbers_sorter - .insert(&key_buffer, ().as_bytes())?; + // TODO extract both Del and Add numbers an strings (dedup) + // TODO use the `itertools::merge_join_by` method to sort and diff both sides (Del and Add) + // TODO if there is a Left generate a Del + // TODO if there is a Right generate an Add + // TODO if there is a Both don't insert + // TODO compare numbers using OrderedFloat and strings using both normalized and original values. + + let geo_support = + geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + + let del_filterable_values = + del_value.map(|value| extract_facet_values(&value, geo_support)); + let add_filterable_values = + add_value.map(|value| extract_facet_values(&value, geo_support)); + + use FilterableValues::{Empty, Null, Values}; + + match (del_filterable_values, add_filterable_values) { + (None, None) => (), + (Some(del_filterable_values), None) => match del_filterable_values { + Null => { + let (mut del_is_null, _) = + facet_is_null_docids.entry(field_id).or_default(); + del_is_null.insert(document); + } + Empty => { + let (mut del_is_empty, _) = + facet_is_empty_docids.entry(field_id).or_default(); + del_is_empty.insert(document); + } + Values { numbers, strings } => { + // insert facet numbers in sorter + for number in numbers { + key_buffer.truncate(size_of::() + size_of::()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those numbers. 
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; + } + } + + // insert normalized and original facet string in sorter + for (normalized, original) in + strings.into_iter().filter(|(n, _)| !n.is_empty()) + { + let normalized_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + + key_buffer.truncate(size_of::() + size_of::()); + key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those strings. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, original.as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; } } + }, + (None, Some(add_filterable_values)) => { + todo!() + } + (Some(del_filterable_values), Some(add_filterable_values)) => { + let (mut del_is_null, mut add_is_null) = + facet_is_null_docids.entry(field_id).or_default(); + let (mut del_is_empty, mut add_is_empty) = + facet_is_empty_docids.entry(field_id).or_default(); - // insert normalized and original facet string in sorter - for (normalized, original) in - strings.into_iter().filter(|(n, _)| !n.is_empty()) - { - let normalized_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); + match (del_filterable_values, add_filterable_values) { + (Null, Null) | (Empty, Empty) => (), + (Null, Empty) => { + del_is_null.insert(document); + add_is_empty.insert(document); + } + (Empty, Null) => { + del_is_empty.insert(document); + add_is_null.insert(document); + } + (Null, Values { numbers, strings }) => { + del_is_null.insert(document); + todo!() + } + (Empty, Values { numbers, strings }) => { + 
del_is_empty.insert(document); + todo!() + } + (Values { numbers, strings }, Null) => { + todo!(); + add_is_null.insert(document); + } + (Values { numbers, strings }, Empty) => { + todo!(); + add_is_empty.insert(document); + } + ( + Values { numbers: mut del_numbers, strings: mut del_strings }, + Values { numbers: mut add_numbers, strings: mut add_strings }, + ) => { + // We sort and dedup the float numbers + del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); + add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); + del_numbers.dedup_by_key(|f| OrderedFloat(*f)); + add_numbers.dedup_by_key(|f| OrderedFloat(*f)); - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); - fid_docid_facet_strings_sorter - .insert(&key_buffer, original.as_bytes())?; + let merged_numbers_iter = itertools::merge_join_by( + del_numbers.into_iter().map(OrderedFloat), + add_numbers.into_iter().map(OrderedFloat), + |del, add| del.cmp(&add), + ); + + // insert facet numbers in sorter + for eob in merged_numbers_iter { + key_buffer + .truncate(size_of::() + size_of::()); + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left(OrderedFloat(number)) => { + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those numbers. 
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter + .insert(&key_buffer, bytes)?; + } + } + EitherOrBoth::Right(OrderedFloat(number)) => { + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those numbers. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter + .insert(&key_buffer, bytes)?; + } + } + } + } + + // We sort and dedup the normalized and original strings + del_strings.sort_unstable(); + add_strings.sort_unstable(); + del_strings.dedup(); + add_strings.dedup(); + + let merged_strings_iter = itertools::merge_join_by( + del_strings.into_iter().filter(|(n, _)| !n.is_empty()), + add_strings.into_iter().filter(|(n, _)| !n.is_empty()), + |del, add| del.cmp(&add), + ); + + // insert normalized and original facet string in sorter + for eob in merged_strings_iter { + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left((normalized, original)) => { + let truncated = truncate_string(normalized); + + key_buffer.truncate( + size_of::() + size_of::(), + ); + key_buffer.extend_from_slice(truncated.as_bytes()); + + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, original)?; + let bytes = obkv.into_inner()?; + fid_docid_facet_strings_sorter + .insert(&key_buffer, bytes)?; + } + EitherOrBoth::Right((normalized, original)) => { + let truncated = truncate_string(normalized); + + key_buffer.truncate( + size_of::() + size_of::(), + ); + key_buffer.extend_from_slice(truncated.as_bytes()); + + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, original)?; + let bytes = 
obkv.into_inner()?; + fid_docid_facet_strings_sorter + .insert(&key_buffer, bytes)?; + } + } + } + } } } } @@ -135,6 +325,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, tempfile::tempfile()?, ); + // TODO generate an Obkv for (fid, bitmap) in facet_exists_docids.into_iter() { let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; @@ -146,12 +337,14 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, tempfile::tempfile()?, ); + // TODO generate an Obkv for (fid, bitmap) in facet_is_null_docids.into_iter() { let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; } let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; + // TODO generate an Obkv let mut facet_is_empty_docids_writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -243,3 +436,10 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { } } } + +fn truncate_string(mut s: String) -> String { + s.char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect() +} From 6ae4100f0720ec1973ed73ad53719f3e74c88aac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 17 Oct 2023 18:15:14 +0200 Subject: [PATCH 017/127] Generate the DelAdd for is_null, is_empty, and exists --- .../extract/extract_fid_docid_facet_values.rs | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 0340fb709..e8d70bf0d 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ 
b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -325,10 +325,14 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, tempfile::tempfile()?, ); - // TODO generate an Obkv - for (fid, bitmap) in facet_exists_docids.into_iter() { - let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); - facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() { + let mut obkv = KvWriterDelAdd::memory(); + let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap(); + let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap(); + obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; + obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; + let bytes = obkv.into_inner()?; + facet_exists_docids_writer.insert(fid.to_be_bytes(), &bytes)?; } let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; @@ -337,22 +341,30 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_level, tempfile::tempfile()?, ); - // TODO generate an Obkv - for (fid, bitmap) in facet_is_null_docids.into_iter() { - let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); - facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() { + let mut obkv = KvWriterDelAdd::memory(); + let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap(); + let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap(); + obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; + obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; + let bytes = obkv.into_inner()?; + facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bytes)?; } let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; - // TODO generate an Obkv let mut facet_is_empty_docids_writer = 
create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, tempfile::tempfile()?, ); - for (fid, bitmap) in facet_is_empty_docids.into_iter() { - let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); - facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() { + let mut obkv = KvWriterDelAdd::memory(); + let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap(); + let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap(); + obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; + obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; + let bytes = obkv.into_inner()?; + facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bytes)?; } let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; From bc45c1206d01654f6272ffac5cdbfa76aeaa7930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Oct 2023 11:01:02 +0200 Subject: [PATCH 018/127] Implement all the facet extraction paths and simplify them --- .../extract/extract_fid_docid_facet_values.rs | 404 +++++++++--------- 1 file changed, 212 insertions(+), 192 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index e8d70bf0d..ec0960b86 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,22 +1,31 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; use std::mem::size_of; +use std::result::Result as StdResult; +use grenad::Sorter; use heed::zerocopy::AsBytes; use heed::BytesEncode; use itertools::EitherOrBoth; use ordered_float::OrderedFloat; use 
roaring::RoaringBitmap; use serde_json::{from_slice, Value}; +use FilterableValues::{Empty, Null, Values}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; +use crate::{ + CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH, +}; + +/// The length of the elements that are always in the buffer when inserting new values. +const TRUNCATE_SIZE: usize = size_of::() + size_of::(); /// The extracted facet values stored in grenad files by type. pub struct ExtractedFacetValues { @@ -68,7 +77,10 @@ pub fn extract_fid_docid_facet_values( let mut facet_is_null_docids = BTreeMap::::new(); let mut facet_is_empty_docids = BTreeMap::::new(); - let mut key_buffer = Vec::new(); + // We create two buffer for mutable ref issues with closures. + let mut numbers_key_buffer = Vec::new(); + let mut strings_key_buffer = Vec::new(); + let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? 
{ // TODO Obkv> @@ -76,18 +88,21 @@ pub fn extract_fid_docid_facet_values( for (field_id, field_bytes) in obkv.iter() { if faceted_fields.contains(&field_id) { - key_buffer.clear(); + numbers_key_buffer.clear(); + strings_key_buffer.clear(); // Set key to the field_id // Note: this encoding is consistent with FieldIdCodec - key_buffer.extend_from_slice(&field_id.to_be_bytes()); + numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); + strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); // Here, we know already that the document must be added to the “field id exists” database let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); // For the other extraction tasks, prefix the key with the field_id and the document_id - key_buffer.extend_from_slice(docid_bytes); + numbers_key_buffer.extend_from_slice(docid_bytes); + strings_key_buffer.extend_from_slice(docid_bytes); let del_add_obkv = obkv::KvReader::new(field_bytes); let del_value = match del_add_obkv.get(DelAdd::Deletion) { @@ -100,8 +115,13 @@ pub fn extract_fid_docid_facet_values( }; // We insert the document id on the Del and the Add side if the field exists. 
- let (mut del_exists, mut add_exists) = + let (ref mut del_exists, ref mut add_exists) = facet_exists_docids.entry(field_id).or_default(); + let (ref mut del_is_null, ref mut add_is_null) = + facet_is_null_docids.entry(field_id).or_default(); + let (ref mut del_is_empty, ref mut add_is_empty) = + facet_is_empty_docids.entry(field_id).or_default(); + if del_value.is_some() { del_exists.insert(document); } @@ -109,84 +129,58 @@ pub fn extract_fid_docid_facet_values( add_exists.insert(document); } - // TODO extract both Del and Add numbers an strings (dedup) - // TODO use the `itertools::merge_join_by` method to sort and diff both sides (Del and Add) - // TODO if there is a Left generate a Del - // TODO if there is a Right generate an Add - // TODO if there is a Both don't insert - // TODO compare numbers using OrderedFloat and strings using both normalized and original values. - let geo_support = geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); - let del_filterable_values = del_value.map(|value| extract_facet_values(&value, geo_support)); let add_filterable_values = add_value.map(|value| extract_facet_values(&value, geo_support)); - use FilterableValues::{Empty, Null, Values}; + // Those closures are just here to simplify things a bit. 
+ let mut insert_numbers_diff = |del_numbers, add_numbers| { + insert_numbers_diff( + &mut fid_docid_facet_numbers_sorter, + &mut numbers_key_buffer, + del_numbers, + add_numbers, + ) + }; + let mut insert_strings_diff = |del_strings, add_strings| { + insert_strings_diff( + &mut fid_docid_facet_strings_sorter, + &mut strings_key_buffer, + del_strings, + add_strings, + ) + }; match (del_filterable_values, add_filterable_values) { (None, None) => (), (Some(del_filterable_values), None) => match del_filterable_values { Null => { - let (mut del_is_null, _) = - facet_is_null_docids.entry(field_id).or_default(); del_is_null.insert(document); } Empty => { - let (mut del_is_empty, _) = - facet_is_empty_docids.entry(field_id).or_default(); del_is_empty.insert(document); } Values { numbers, strings } => { - // insert facet numbers in sorter - for number in numbers { - key_buffer.truncate(size_of::() + size_of::()); - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); - - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those numbers. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, ().as_bytes())?; - let bytes = obkv.into_inner()?; - fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; - } - } - - // insert normalized and original facet string in sorter - for (normalized, original) in - strings.into_iter().filter(|(n, _)| !n.is_empty()) - { - let normalized_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); - - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those strings. 
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, original.as_bytes())?; - let bytes = obkv.into_inner()?; - fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; - } + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + }, + (None, Some(add_filterable_values)) => match add_filterable_values { + Null => { + add_is_null.insert(document); + } + Empty => { + add_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; } }, - (None, Some(add_filterable_values)) => { - todo!() - } (Some(del_filterable_values), Some(add_filterable_values)) => { - let (mut del_is_null, mut add_is_null) = - facet_is_null_docids.entry(field_id).or_default(); - let (mut del_is_empty, mut add_is_empty) = - facet_is_empty_docids.entry(field_id).or_default(); - match (del_filterable_values, add_filterable_values) { (Null, Null) | (Empty, Empty) => (), (Null, Empty) => { @@ -198,120 +192,31 @@ pub fn extract_fid_docid_facet_values( add_is_null.insert(document); } (Null, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; del_is_null.insert(document); - todo!() } (Empty, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; del_is_empty.insert(document); - todo!() } (Values { numbers, strings }, Null) => { - todo!(); add_is_null.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; } (Values { numbers, strings }, Empty) => { - todo!(); add_is_empty.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; } ( - Values { numbers: mut del_numbers, strings: mut del_strings }, - Values { numbers: mut add_numbers, strings: mut add_strings }, + Values { numbers: del_numbers, strings: del_strings }, + Values { numbers: add_numbers, strings: 
add_strings }, ) => { - // We sort and dedup the float numbers - del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); - add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); - del_numbers.dedup_by_key(|f| OrderedFloat(*f)); - add_numbers.dedup_by_key(|f| OrderedFloat(*f)); - - let merged_numbers_iter = itertools::merge_join_by( - del_numbers.into_iter().map(OrderedFloat), - add_numbers.into_iter().map(OrderedFloat), - |del, add| del.cmp(&add), - ); - - // insert facet numbers in sorter - for eob in merged_numbers_iter { - key_buffer - .truncate(size_of::() + size_of::()); - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left(OrderedFloat(number)) => { - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); - - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those numbers. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, ().as_bytes())?; - let bytes = obkv.into_inner()?; - fid_docid_facet_numbers_sorter - .insert(&key_buffer, bytes)?; - } - } - EitherOrBoth::Right(OrderedFloat(number)) => { - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); - - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those numbers. 
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, ().as_bytes())?; - let bytes = obkv.into_inner()?; - fid_docid_facet_numbers_sorter - .insert(&key_buffer, bytes)?; - } - } - } - } - - // We sort and dedup the normalized and original strings - del_strings.sort_unstable(); - add_strings.sort_unstable(); - del_strings.dedup(); - add_strings.dedup(); - - let merged_strings_iter = itertools::merge_join_by( - del_strings.into_iter().filter(|(n, _)| !n.is_empty()), - add_strings.into_iter().filter(|(n, _)| !n.is_empty()), - |del, add| del.cmp(&add), - ); - - // insert normalized and original facet string in sorter - for eob in merged_strings_iter { - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left((normalized, original)) => { - let truncated = truncate_string(normalized); - - key_buffer.truncate( - size_of::() + size_of::(), - ); - key_buffer.extend_from_slice(truncated.as_bytes()); - - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, original)?; - let bytes = obkv.into_inner()?; - fid_docid_facet_strings_sorter - .insert(&key_buffer, bytes)?; - } - EitherOrBoth::Right((normalized, original)) => { - let truncated = truncate_string(normalized); - - key_buffer.truncate( - size_of::() + size_of::(), - ); - key_buffer.extend_from_slice(truncated.as_bytes()); - - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, original)?; - let bytes = obkv.into_inner()?; - fid_docid_facet_strings_sorter - .insert(&key_buffer, bytes)?; - } - } - } + insert_numbers_diff(del_numbers, add_numbers)?; + insert_strings_diff(del_strings, add_strings)?; } } } @@ -320,19 +225,15 @@ pub fn extract_fid_docid_facet_values( } } + let mut buffer = Vec::new(); let mut facet_exists_docids_writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, tempfile::tempfile()?, ); for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() { - let mut obkv 
= KvWriterDelAdd::memory(); - let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap(); - let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap(); - obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; - obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; - let bytes = obkv.into_inner()?; - facet_exists_docids_writer.insert(fid.to_be_bytes(), &bytes)?; + deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; + facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?; } let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; @@ -342,13 +243,8 @@ pub fn extract_fid_docid_facet_values( tempfile::tempfile()?, ); for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() { - let mut obkv = KvWriterDelAdd::memory(); - let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap(); - let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap(); - obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; - obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; - let bytes = obkv.into_inner()?; - facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bytes)?; + deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; + facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?; } let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; @@ -358,13 +254,8 @@ pub fn extract_fid_docid_facet_values( tempfile::tempfile()?, ); for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() { - let mut obkv = KvWriterDelAdd::memory(); - let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap(); - let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap(); - obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; - obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; - let bytes = obkv.into_inner()?; - facet_is_empty_docids_writer.insert(fid.to_be_bytes(), 
&bytes)?; + deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; + facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?; } let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; @@ -377,6 +268,141 @@ pub fn extract_fid_docid_facet_values( }) } +/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps. +fn deladd_obkv_cbo_roaring_bitmaps( + buffer: &mut Vec, + del_bitmap: &RoaringBitmap, + add_bitmap: &RoaringBitmap, +) -> io::Result<()> { + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(buffer); + let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap(); + let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap(); + obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; + obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; + obkv.finish() +} + +/// Truncates a string to the biggest valid LMDB key size. +fn truncate_string(s: String) -> String { + s.char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect() +} + +/// Computes the diff between both Del and Add numbers and +/// only inserts the parts that differ in the sorter. 
+fn insert_numbers_diff( + fid_docid_facet_numbers_sorter: &mut Sorter, + key_buffer: &mut Vec, + mut del_numbers: Vec, + mut add_numbers: Vec, +) -> Result<()> +where + MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, Error>, +{ + // We sort and dedup the float numbers + del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); + add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); + del_numbers.dedup_by_key(|f| OrderedFloat(*f)); + add_numbers.dedup_by_key(|f| OrderedFloat(*f)); + + let merged_numbers_iter = itertools::merge_join_by( + del_numbers.into_iter().map(OrderedFloat), + add_numbers.into_iter().map(OrderedFloat), + |del, add| del.cmp(add), + ); + + // insert facet numbers in sorter + for eob in merged_numbers_iter { + key_buffer.truncate(TRUNCATE_SIZE); + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left(OrderedFloat(number)) => { + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those numbers. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; + } + } + EitherOrBoth::Right(OrderedFloat(number)) => { + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those numbers. 
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, ().as_bytes())?; + let bytes = obkv.into_inner()?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; + } + } + } + } + + Ok(()) +} + +/// Computes the diff between both Del and Add strings and +/// only inserts the parts that differ in the sorter. +fn insert_strings_diff( + fid_docid_facet_strings_sorter: &mut Sorter, + key_buffer: &mut Vec, + mut del_strings: Vec<(String, String)>, + mut add_strings: Vec<(String, String)>, +) -> Result<()> +where + MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, Error>, +{ + // We sort and dedup the normalized and original strings + del_strings.sort_unstable(); + add_strings.sort_unstable(); + del_strings.dedup(); + add_strings.dedup(); + + let merged_strings_iter = itertools::merge_join_by( + del_strings.into_iter().filter(|(n, _)| !n.is_empty()), + add_strings.into_iter().filter(|(n, _)| !n.is_empty()), + |del, add| del.cmp(add), + ); + + // insert normalized and original facet string in sorter + for eob in merged_strings_iter { + key_buffer.truncate(TRUNCATE_SIZE); + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left((normalized, original)) => { + let truncated = truncate_string(normalized); + key_buffer.extend_from_slice(truncated.as_bytes()); + + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, original)?; + let bytes = obkv.into_inner()?; + fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; + } + EitherOrBoth::Right((normalized, original)) => { + let truncated = truncate_string(normalized); + key_buffer.extend_from_slice(truncated.as_bytes()); + + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, original)?; + let bytes = obkv.into_inner()?; + fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; + } + } + } + + Ok(()) +} + /// Represent what a document field contains. 
enum FilterableValues { /// Corresponds to the JSON `null` value. @@ -387,6 +413,7 @@ enum FilterableValues { Values { numbers: Vec, strings: Vec<(String, String)> }, } +/// Extracts the facet values of a JSON field. fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { fn inner_extract_facet_values( value: &Value, @@ -448,10 +475,3 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { } } } - -fn truncate_string(mut s: String) -> String { - s.char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect() -} From a82dee21e09dcf4d55ed604478bfa4aa4e7e6da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Oct 2023 13:53:58 +0200 Subject: [PATCH 019/127] Rename docid_fid into fid_docid --- .../extract/extract_fid_docid_facet_values.rs | 12 +++---- .../src/update/index_documents/extract/mod.rs | 36 +++++++++---------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index ec0960b86..87320a675 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -29,9 +29,8 @@ const TRUNCATE_SIZE: usize = size_of::() + size_of::(); /// The extracted facet values stored in grenad files by type. 
pub struct ExtractedFacetValues { - // TOOD rename into `fid_docid_*` - pub docid_fid_facet_numbers_chunk: grenad::Reader>, - pub docid_fid_facet_strings_chunk: grenad::Reader>, + pub fid_docid_facet_numbers_chunk: grenad::Reader>, + pub fid_docid_facet_strings_chunk: grenad::Reader>, pub fid_facet_is_null_docids_chunk: grenad::Reader>, pub fid_facet_is_empty_docids_chunk: grenad::Reader>, pub fid_facet_exists_docids_chunk: grenad::Reader>, @@ -44,7 +43,6 @@ pub struct ExtractedFacetValues { /// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially. #[logging_timer::time] pub fn extract_fid_docid_facet_values( - // TODO Reader>> obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, @@ -83,7 +81,6 @@ pub fn extract_fid_docid_facet_values( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { - // TODO Obkv> let obkv = obkv::KvReader::new(value); for (field_id, field_bytes) in obkv.iter() { @@ -96,7 +93,6 @@ pub fn extract_fid_docid_facet_values( numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); - // Here, we know already that the document must be added to the “field id exists” database let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); @@ -260,8 +256,8 @@ pub fn extract_fid_docid_facet_values( let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; Ok(ExtractedFacetValues { - docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, - docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, + fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, 
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, fid_facet_exists_docids_chunk: facet_exists_docids_reader, diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 164f95452..0522fc93c 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -92,9 +92,9 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, ( - docid_fid_facet_numbers_chunks, + fid_docid_facet_numbers_chunks, ( - docid_fid_facet_strings_chunks, + fid_docid_facet_strings_chunks, ( facet_is_null_docids_chunks, (facet_is_empty_docids_chunks, facet_exists_docids_chunks), @@ -206,7 +206,7 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>>( - docid_fid_facet_strings_chunks, + fid_docid_facet_strings_chunks, indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, @@ -216,7 +216,7 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>>( - docid_fid_facet_numbers_chunks, + fid_docid_facet_numbers_chunks, indexer, lmdb_writer_sx, extract_facet_number_docids, @@ -352,7 +352,7 @@ fn send_and_extract_flattened_documents_data( }); } - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = + let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { let (documents_ids, docid_word_positions_chunk, script_language_pair) = @@ -380,8 +380,8 @@ fn send_and_extract_flattened_documents_data( }, || { let ExtractedFacetValues { - docid_fid_facet_numbers_chunk, - docid_fid_facet_strings_chunk, + fid_docid_facet_numbers_chunk, + fid_docid_facet_strings_chunk, fid_facet_is_null_docids_chunk, fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk, @@ -392,26 +392,26 @@ fn send_and_extract_flattened_documents_data( geo_fields_ids, )?; - // send 
docid_fid_facet_numbers_chunk to DB writer - let docid_fid_facet_numbers_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? }; + // send fid_docid_facet_numbers_chunk to DB writer + let fid_docid_facet_numbers_chunk = + unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? }; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( - docid_fid_facet_numbers_chunk.clone(), + fid_docid_facet_numbers_chunk.clone(), ))); - // send docid_fid_facet_strings_chunk to DB writer - let docid_fid_facet_strings_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? }; + // send fid_docid_facet_strings_chunk to DB writer + let fid_docid_facet_strings_chunk = + unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? }; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), + fid_docid_facet_strings_chunk.clone(), ))); Ok(( - docid_fid_facet_numbers_chunk, + fid_docid_facet_numbers_chunk, ( - docid_fid_facet_strings_chunk, + fid_docid_facet_strings_chunk, ( fid_facet_is_null_docids_chunk, (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), @@ -421,5 +421,5 @@ fn send_and_extract_flattened_documents_data( }, ); - Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?)) } From fcd3a1434d2a8e6da49a5a86d0591bd872d3de29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Oct 2023 17:40:13 +0200 Subject: [PATCH 020/127] Update extract_facet_number_docids to support deladd obkvs --- .../cbo_roaring_bitmap_codec.rs | 10 ++++-- .../extract/extract_facet_number_docids.rs | 26 ++++++++++------ .../helpers/merge_functions.rs | 31 +++++++++++++++++++ .../src/update/index_documents/helpers/mod.rs | 5 +-- 4 files changed, 57 insertions(+), 15 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs 
b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index bf76287d8..79b52695e 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -60,12 +60,16 @@ impl CboRoaringBitmapCodec { /// if the merged values length is under the threshold, values are directly /// serialized in the buffer else a RoaringBitmap is created from the /// values and is serialized in the buffer. - pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + pub fn merge_into(slices: I, buffer: &mut Vec) -> io::Result<()> + where + I: IntoIterator, + A: AsRef<[u8]>, + { let mut roaring = RoaringBitmap::new(); let mut vec = Vec::new(); for bytes in slices { - if bytes.len() <= THRESHOLD * size_of::() { + if bytes.as_ref().len() <= THRESHOLD * size_of::() { let mut reader = bytes.as_ref(); while let Ok(integer) = reader.read_u32::() { vec.push(integer); @@ -85,7 +89,7 @@ impl CboRoaringBitmapCodec { } } else { // We can unwrap safely because the vector is sorted upper. 
- let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); + let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap(); roaring.serialize_into(buffer)?; } } else { diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 76dc6d3c6..f860aacba 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -4,11 +4,12 @@ use std::io::{self, BufReader}; use heed::{BytesDecode, BytesEncode}; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, }; +use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -17,8 +18,7 @@ use crate::Result; /// documents ids from the given chunk of docid facet number positions. #[logging_timer::time] pub fn extract_facet_number_docids( - // TODO Reader> - docid_fid_facet_number: grenad::Reader, + fid_docid_facet_number: grenad::Reader, indexer: GrenadParameters, ) -> Result>> { puffin::profile_function!(); @@ -27,24 +27,30 @@ pub fn extract_facet_number_docids( let mut facet_number_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - // TODO We must modify the merger to do unions of Del and Add separately - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut cursor = docid_fid_facet_number.into_cursor()?; - // TODO the value is a Obkv and must be taken into account - while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ + let mut buffer = Vec::new(); + let mut cursor = fid_docid_facet_number.into_cursor()?; + while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? { let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); let key = FacetGroupKey { field_id, level: 0, left_bound: number }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - // TODO We must put a Obkv - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() { + obkv.insert(deladd_key, document_id.to_ne_bytes())?; + } + obkv.finish()?; + + facet_number_docids_sorter.insert(key_bytes, &buffer)?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index dee200b21..a418f8786 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -205,3 +205,34 @@ pub fn merge_cbo_roaring_bitmaps<'a>( Ok(Cow::from(vec)) } } + +pub fn merge_deladd_cbo_roaring_bitmaps<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_bitmaps_bytes = Vec::new(); + let mut add_bitmaps_bytes = Vec::new(); + for value in values { + let obkv = KvReaderDelAdd::new(value); + if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { + del_bitmaps_bytes.push(bitmap_bytes); + } + if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { + add_bitmaps_bytes.push(bitmap_bytes); + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + let mut buffer = Vec::new(); + CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?; + output_deladd_obkv.insert(DelAdd::Deletion, 
&buffer)?; + buffer.clear(); + CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?; + output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 8f70a2de2..1f2f8e6ef 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -14,8 +14,9 @@ pub use grenad_helpers::{ }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, - merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, - obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, + merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps, + obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, + serialize_roaring_bitmap, MergeFn, }; use crate::MAX_WORD_LENGTH; From e2bc054604c96f9fefc036a1dcec4aa9ec9ae4b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 18 Oct 2023 18:06:41 +0200 Subject: [PATCH 021/127] Update extract_facet_string_docids to support deladd obkvs --- .../extract/extract_facet_string_docids.rs | 41 ++++++++----------- .../helpers/merge_functions.rs | 3 ++ 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index b861c04e4..2ade776c3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,13 +1,15 @@ use std::fs::File; -use std::io::{self, BufReader}; +use std::io::BufReader; +use std::{io, str}; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, 
GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; -use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; +use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; +use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. /// @@ -15,7 +17,6 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// documents ids from the given chunk of docid facet string positions. #[logging_timer::time] pub fn extract_facet_string_docids( - // TODO Reader> docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, ) -> Result>> { @@ -25,17 +26,16 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - // TODO We must modify the merger to do unions of Del and Add separately - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); + let mut buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; - while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { - // TODO the value is a Obkv and must be taken into account + while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? 
{ let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); @@ -43,22 +43,17 @@ pub fn extract_facet_string_docids( try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); - let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; - - let normalised_truncated_value: String; - if normalised_value.len() > MAX_FACET_VALUE_LENGTH { - normalised_truncated_value = normalised_value - .char_indices() - .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - normalised_value = normalised_truncated_value.as_str(); - } - let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; + let normalized_value = str::from_utf8(normalized_value_bytes)?; + let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - // document id is encoded in native-endian because of the CBO roaring bitmap codec - // TODO Reader> - facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { + obkv.insert(deladd_key, document_id.to_ne_bytes())?; + } + obkv.finish()?; + facet_string_docids_sorter.insert(&key_bytes, &buffer)?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index a418f8786..770629c8e 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -193,6 +193,7 @@ pub fn obkvs_keep_last_addition_merge_deletions<'a>( inner_merge_del_add_obkvs(obkvs, false) } +/// Do a union of all the CboRoaringBitmaps in the values. 
pub fn merge_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], @@ -206,6 +207,8 @@ pub fn merge_cbo_roaring_bitmaps<'a>( } } +/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. pub fn merge_deladd_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], From 2597bbd107215938b6c6dd9e0c4176e8a564e8ad Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Oct 2023 10:22:39 +0200 Subject: [PATCH 022/127] Make script language docids map taking a tuple of roaring bitmaps expressing the deletions and the additions --- .../extract/extract_docid_word_positions.rs | 24 ++++------ .../src/update/index_documents/typed_chunk.rs | 48 ++++++++----------- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index e02e492d2..36258b275 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError}; use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; -pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; +pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. 
@@ -30,11 +30,7 @@ pub fn extract_docid_word_positions( allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, -) -> Result<( - RoaringBitmap, - grenad::Reader>, - (ScriptLanguageDocidsMap, ScriptLanguageDocidsMap), -)> { +) -> Result<(RoaringBitmap, grenad::Reader>, ScriptLanguageDocidsMap)> { puffin::profile_function!(); let max_positions_per_attributes = max_positions_per_attributes @@ -43,8 +39,7 @@ pub fn extract_docid_word_positions( // initialize destination values. let mut documents_ids = RoaringBitmap::new(); - let mut del_script_language_docids = HashMap::new(); - let mut add_script_language_docids = HashMap::new(); + let mut script_language_docids = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, keep_latest_obkv, @@ -138,25 +133,24 @@ pub fn extract_docid_word_positions( // update script_language_docids deletions. for (script, languages_frequency) in del_script_language_word_count { for (language, _) in languages_frequency { - let entry = del_script_language_docids + let entry = script_language_docids .entry((script, language)) - .or_insert_with(RoaringBitmap::new); - entry.push(document_id); + .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); + entry.0.push(document_id); } } // update script_language_docids additions. 
for (script, languages_frequency) in add_script_language_word_count { for (language, _) in languages_frequency { - let entry = add_script_language_docids + let entry = script_language_docids .entry((script, language)) - .or_insert_with(RoaringBitmap::new); - entry.push(document_id); + .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); + entry.1.push(document_id); } } } - let script_language_docids = (del_script_language_docids, add_script_language_docids); sorter_into_reader(docid_word_positions_sorter, indexer) .map(|reader| (documents_ids, reader, script_language_docids)) } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index f2dc7d336..e3ff9b253 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -43,9 +43,7 @@ pub(crate) enum TypedChunk { FieldIdFacetIsEmptyDocids(grenad::Reader>), GeoPoints(grenad::Reader>), VectorPoints(grenad::Reader>), - ScriptLanguageDocids( - (HashMap<(Script, Language), RoaringBitmap>, HashMap<(Script, Language), RoaringBitmap>), - ), + ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } impl TypedChunk { @@ -103,8 +101,8 @@ impl TypedChunk { TypedChunk::VectorPoints(grenad) => { format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::ScriptLanguageDocids((_, addition)) => { - format!("ScriptLanguageDocids {{ number_of_entries: {} }}", addition.len()) + TypedChunk::ScriptLanguageDocids(sl_map) => { + format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) } } } @@ -346,24 +344,25 @@ pub(crate) fn write_typed_chunk_into_index( log::debug!("There are {} entries in the HNSW so far", hnsw_length); index.put_vector_hnsw(wtxn, &new_hnsw)?; } - TypedChunk::ScriptLanguageDocids((deletion, addition)) => { - for (key, value) in deletion { - if let Some(mut db_values) = index.script_language_docids.get(wtxn, &key)? 
{ - db_values -= value; - if db_values.is_empty() { - index.script_language_docids.delete(wtxn, &key)?; - } else { - index.script_language_docids.put(wtxn, &key, &db_values)?; - } - } - } - - for (key, value) in addition { + TypedChunk::ScriptLanguageDocids(sl_map) => { + for (key, (deletion, addition)) in sl_map { + let mut db_key_exists = false; let final_value = match index.script_language_docids.get(wtxn, &key)? { - Some(mut db_values) => db_values | value, - None => value, + Some(db_values) => { + db_key_exists = true; + (db_values - deletion) | addition + } + None => addition, }; - index.script_language_docids.put(wtxn, &key, &final_value)?; + + if final_value.is_empty() { + // If the database entry exists, delete it. + if db_key_exists == true { + index.script_language_docids.delete(wtxn, &key)?; + } + } else { + index.script_language_docids.put(wtxn, &key, &final_value)?; + } } } } @@ -388,13 +387,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { - let new_value = RoaringBitmap::deserialize_from(new_value)?; - let db_value = RoaringBitmap::deserialize_from(db_value)?; - let value = new_value | db_value; - Ok(serialize_roaring_bitmap(&value, buffer)?) 
-} - fn merge_cbo_roaring_bitmaps( new_value: &[u8], db_value: &[u8], From 46aa75abdb5fd1f25965aa8511344f06692776eb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Oct 2023 11:58:31 +0200 Subject: [PATCH 023/127] update extract word docids --- milli/src/update/del_add.rs | 4 ++ .../extract/extract_word_docids.rs | 70 +++++++++++++++---- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 346ae0afa..c8b7f0f6a 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -98,3 +98,7 @@ pub fn del_add_from_two_obkvs( writer.finish() } + +pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { + del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) +} diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 3df962585..a95162236 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -7,12 +7,13 @@ use heed::BytesDecode; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader, + create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::MergeFn; use crate::{DocumentId, FieldId, Result}; @@ -39,14 +40,15 @@ pub fn extract_word_docids( let mut word_fid_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|x| x / 3), 
); let mut key_buffer = Vec::new(); - let mut words = BTreeSet::new(); + let mut del_words = BTreeSet::new(); + let mut add_words = BTreeSet::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { let (document_id_bytes, fid_bytes) = try_split_array_at(key) @@ -56,24 +58,37 @@ pub fn extract_word_docids( let document_id = u32::from_be_bytes(document_id_bytes); let fid = u16::from_be_bytes(fid_bytes); - for (_pos, word) in KvReaderU16::new(&value).iter() { - words.insert(word.to_vec()); + let del_add_reader = KvReaderDelAdd::new(&value); + // extract all unique words to remove. + if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { + for (_pos, word) in KvReaderU16::new(&deletion).iter() { + del_words.insert(word.to_vec()); + } + } + + // extract all unique additional words. + if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + for (_pos, word) in KvReaderU16::new(&addition).iter() { + add_words.insert(word.to_vec()); + } } words_into_sorter( document_id, fid, &mut key_buffer, - &mut words, + &del_words, + &add_words, &mut word_fid_docids_sorter, )?; - words.clear(); + del_words.clear(); + add_words.clear(); } let mut word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -82,7 +97,7 @@ pub fn extract_word_docids( let mut exact_word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -96,8 +111,12 @@ pub fn extract_word_docids( ); let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; + // TODO: replace sorters by writers by accumulating values into a buffer before inserting them. while let Some((key, value)) = iter.next()? 
{ - word_fid_docids_writer.insert(key, value)?; + // only keep the value if their is a change to apply in the DB. + if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { + word_fid_docids_writer.insert(key, value)?; + } let (word, fid) = StrBEU16Codec::bytes_decode(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; @@ -121,20 +140,41 @@ fn words_into_sorter( document_id: DocumentId, fid: FieldId, key_buffer: &mut Vec, - words: &mut BTreeSet>, + del_words: &BTreeSet>, + add_words: &BTreeSet>, word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { puffin::profile_function!(); - for word_bytes in words.iter() { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut buffer = Vec::new(); + for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let word_bytes = match eob { + Left(word_bytes) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + word_bytes + } + Right(word_bytes) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + word_bytes + } + Both(word_bytes, _) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + word_bytes + } + }; + key_buffer.clear(); key_buffer.extend_from_slice(&word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; } - words.clear(); - Ok(()) } From 6bcf8b4f8cab1b58be1a96b76c14c8fa056ef17e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Oct 2023 13:27:07 +0200 Subject: [PATCH 024/127] update extract word position docids --- .../extract/extract_word_position_docids.rs | 105 ++++++++++++++---- 1 file 
changed, 82 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 220dca960..2ff2f2ad5 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -1,15 +1,17 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::MergeFn; use crate::{bucketed_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. 
@@ -27,14 +29,15 @@ pub fn extract_word_position_docids( let mut word_position_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut word_positions: HashSet<(u16, Vec)> = HashSet::new(); + let mut del_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); + let mut add_word_positions: BTreeSet<(u16, Vec)> = BTreeSet::new(); let mut current_document_id: Option = None; let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; @@ -44,36 +47,92 @@ pub fn extract_word_position_docids( let document_id = DocumentId::from_be_bytes(document_id_bytes); if current_document_id.map_or(false, |id| document_id != id) { - for (position, word_bytes) in word_positions.iter() { - key_buffer.clear(); - key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); - key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter - .insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?; - } - word_positions.clear(); + words_position_into_sorter( + current_document_id.unwrap(), + &mut key_buffer, + &del_word_positions, + &add_word_positions, + &mut word_position_docids_sorter, + )?; + del_word_positions.clear(); + add_word_positions.clear(); } current_document_id = Some(document_id); - for (position, word_bytes) in KvReaderU16::new(&value).iter() { - let position = bucketed_position(position); - word_positions.insert((position, word_bytes.to_vec())); + let del_add_reader = KvReaderDelAdd::new(&value); + // extract all unique words to remove. + if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { + for (position, word_bytes) in KvReaderU16::new(deletion).iter() { + let position = bucketed_position(position); + del_word_positions.insert((position, word_bytes.to_vec())); + } + } + + // extract all unique additional words. 
+ if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + for (position, word_bytes) in KvReaderU16::new(addition).iter() { + let position = bucketed_position(position); + add_word_positions.insert((position, word_bytes.to_vec())); + } } } if let Some(document_id) = current_document_id { - for (position, word_bytes) in word_positions { - key_buffer.clear(); - key_buffer.extend_from_slice(&word_bytes); - key_buffer.push(0); - key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; - } + words_position_into_sorter( + document_id, + &mut key_buffer, + &del_word_positions, + &add_word_positions, + &mut word_position_docids_sorter, + )?; } + // TODO remove noop DelAdd OBKV let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; Ok(word_position_docids_reader) } + +fn words_position_into_sorter( + document_id: DocumentId, + key_buffer: &mut Vec, + del_word_positions: &BTreeSet<(u16, Vec)>, + add_word_positions: &BTreeSet<(u16, Vec)>, + word_position_docids_sorter: &mut grenad::Sorter, +) -> Result<()> { + puffin::profile_function!(); + + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut buffer = Vec::new(); + for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a)) + { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let (position, word_bytes) = match eob { + Left(key) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key + } + Right(key) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key + } + Both(key, _) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key + } + }; + + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + key_buffer.push(0); + 
key_buffer.extend_from_slice(&position.to_be_bytes()); + word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; + } + + Ok(()) +} From 87e3d278786ad90f60014bc4e92d9a24adc14afd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Oct 2023 14:18:14 +0200 Subject: [PATCH 025/127] update extract word pair proximity to support deladd obkvs --- .../extract_word_pair_proximity_docids.rs | 147 +++++++++++++----- 1 file changed, 109 insertions(+), 38 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 70865acbe..76a1d1d68 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, VecDeque}; +use std::collections::{BTreeMap, VecDeque}; use std::fs::File; use std::io::BufReader; use std::{cmp, io}; @@ -6,12 +6,13 @@ use std::{cmp, io}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader, - try_split_array_at, writer_into_reader, GrenadParameters, MergeFn, + create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, + writer_into_reader, GrenadParameters, MergeFn, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::proximity::{index_proximity, MAX_DISTANCE}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::{DocumentId, Result}; /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. 
@@ -32,7 +33,7 @@ pub fn extract_word_pair_proximity_docids( .map(|_| { create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -41,9 +42,12 @@ pub fn extract_word_pair_proximity_docids( }) .collect(); - let mut word_positions: VecDeque<(String, u16)> = + let mut del_word_positions: VecDeque<(String, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); - let mut word_pair_proximity = HashMap::new(); + let mut add_word_positions: VecDeque<(String, u16)> = + VecDeque::with_capacity(MAX_DISTANCE as usize); + let mut del_word_pair_proximity = BTreeMap::new(); + let mut add_word_pair_proximity = BTreeMap::new(); let mut current_document_id = None; let mut cursor = docid_word_positions.into_cursor()?; @@ -55,50 +59,90 @@ pub fn extract_word_pair_proximity_docids( // if we change document, we fill the sorter if current_document_id.map_or(false, |id| id != document_id) { puffin::profile_scope!("Document into sorter"); - while !word_positions.is_empty() { - word_positions_into_word_pair_proximity( - &mut word_positions, - &mut word_pair_proximity, - )?; - } document_word_positions_into_sorter( current_document_id.unwrap(), - &word_pair_proximity, + &del_word_pair_proximity, + &add_word_pair_proximity, &mut word_pair_proximity_docids_sorters, )?; - word_pair_proximity.clear(); - word_positions.clear(); + del_word_pair_proximity.clear(); + add_word_pair_proximity.clear(); } current_document_id = Some(document_id); - for (position, word) in KvReaderU16::new(&value).iter() { - // drain the proximity window until the head word is considered close to the word we are inserting. 
- while word_positions.get(0).map_or(false, |(_w, p)| { - index_proximity(*p as u32, position as u32) >= MAX_DISTANCE - }) { - word_positions_into_word_pair_proximity( - &mut word_positions, - &mut word_pair_proximity, - )?; - } + let (del, add): (Result<_>, Result<_>) = rayon::join( + || { + // deletions + if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) { + for (position, word) in KvReaderU16::new(deletion).iter() { + // drain the proximity window until the head word is considered close to the word we are inserting. + while del_word_positions.get(0).map_or(false, |(_w, p)| { + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut del_word_positions, + &mut del_word_pair_proximity, + )?; + } - // insert the new word. - let word = std::str::from_utf8(word)?; - word_positions.push_back((word.to_string(), position)); - } + // insert the new word. + let word = std::str::from_utf8(word)?; + del_word_positions.push_back((word.to_string(), position)); + } + + while !del_word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut del_word_positions, + &mut del_word_pair_proximity, + )?; + } + } + + Ok(()) + }, + || { + // additions + if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) { + for (position, word) in KvReaderU16::new(addition).iter() { + // drain the proximity window until the head word is considered close to the word we are inserting. + while add_word_positions.get(0).map_or(false, |(_w, p)| { + index_proximity(*p as u32, position as u32) >= MAX_DISTANCE + }) { + word_positions_into_word_pair_proximity( + &mut add_word_positions, + &mut add_word_pair_proximity, + )?; + } + + // insert the new word. 
+ let word = std::str::from_utf8(word)?; + add_word_positions.push_back((word.to_string(), position)); + } + + while !add_word_positions.is_empty() { + word_positions_into_word_pair_proximity( + &mut add_word_positions, + &mut add_word_pair_proximity, + )?; + } + } + + Ok(()) + }, + ); + + del?; + add?; } if let Some(document_id) = current_document_id { puffin::profile_scope!("Final document into sorter"); - while !word_positions.is_empty() { - word_positions_into_word_pair_proximity(&mut word_positions, &mut word_pair_proximity)?; - } - document_word_positions_into_sorter( document_id, - &word_pair_proximity, + &del_word_pair_proximity, + &add_word_pair_proximity, &mut word_pair_proximity_docids_sorters, )?; } @@ -124,11 +168,38 @@ pub fn extract_word_pair_proximity_docids( /// close to each other. fn document_word_positions_into_sorter( document_id: DocumentId, - word_pair_proximity: &HashMap<(String, String), u8>, + del_word_pair_proximity: &BTreeMap<(String, String), u8>, + add_word_pair_proximity: &BTreeMap<(String, String), u8>, word_pair_proximity_docids_sorters: &mut Vec>, ) -> Result<()> { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + let mut buffer = Vec::new(); let mut key_buffer = Vec::new(); - for ((w1, w2), prox) in word_pair_proximity { + for eob in + merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { + d.cmp(a) + }) + { + buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut buffer); + let ((w1, w2), prox) = match eob { + Left(key_value) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key_value + } + Right(key_value) => { + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_value + } + Both(key_value, _) => { + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_value + } + }; + 
key_buffer.clear(); key_buffer.push(*prox as u8); key_buffer.extend_from_slice(w1.as_bytes()); @@ -136,7 +207,7 @@ fn document_word_positions_into_sorter( key_buffer.extend_from_slice(w2.as_bytes()); word_pair_proximity_docids_sorters[*prox as usize - 1] - .insert(&key_buffer, document_id.to_ne_bytes())?; + .insert(&key_buffer, value_writer.into_inner().unwrap())?; } Ok(()) @@ -144,7 +215,7 @@ fn document_word_positions_into_sorter( fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut HashMap<(String, String), u8>, + word_pair_proximity: &mut BTreeMap<(String, String), u8>, ) -> Result<()> { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { From 40186bf4033383ed1acc287b93ee01b7d7162d0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 10:38:58 +0200 Subject: [PATCH 026/127] Rename FieldIdWordCountDocids correctly --- milli/src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 0522fc93c..7d643d61f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -167,7 +167,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_fid_word_count_docids, merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdWordcountDocids, + TypedChunk::FieldIdWordCountDocids, "field-id-wordcount-docids", ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e3ff9b253..2e7266db0 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -27,7 +27,7 @@ pub(crate) enum TypedChunk { 
FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), Documents(grenad::Reader), - FieldIdWordcountDocids(grenad::Reader>), + FieldIdWordCountDocids(grenad::Reader>), NewDocumentsIds(RoaringBitmap), WordDocids { word_docids_reader: grenad::Reader>, @@ -58,7 +58,7 @@ impl TypedChunk { TypedChunk::Documents(grenad) => { format!("Documents {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::FieldIdWordcountDocids(grenad) => { + TypedChunk::FieldIdWordCountDocids(grenad) => { format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) } TypedChunk::NewDocumentsIds(grenad) => { @@ -126,7 +126,7 @@ pub(crate) fn write_typed_chunk_into_index( index.documents.remap_types::().put(wtxn, key, value)?; } } - TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { + TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { append_entries_into_database( fid_word_count_docids_iter, &index.field_id_word_count_docids, @@ -478,7 +478,7 @@ where while let Some((key, value)) = cursor.move_on_next()? 
{ if valid_lmdb_key(key) { debug_assert!( - K::bytes_decode(&key).is_some(), + K::bytes_decode(key).is_some(), "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", key.len(), &key From 2d3f15f82c4f6104aeba9199b8a71f6924de45fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 10:47:00 +0200 Subject: [PATCH 027/127] Introduce a function to only serialize the Add side of a DelAdd obkv --- .../src/update/index_documents/typed_chunk.rs | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2e7266db0..e0e2ff1ec 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -19,6 +19,7 @@ use crate::distance::NDotProductPoint; use crate::error::UserError; use crate::facet::FacetType; use crate::index::Hnsw; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; @@ -132,7 +133,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.field_id_word_count_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -151,7 +152,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; @@ -161,7 +162,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.exact_word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; @@ -171,7 +172,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_fid_docids, wtxn, index_is_empty, - 
|value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; @@ -193,7 +194,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_position_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -214,7 +215,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_exists_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -225,7 +226,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_null_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -236,7 +237,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_empty_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -247,7 +248,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_pair_proximity_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -320,7 +321,7 @@ pub(crate) fn write_typed_chunk_into_index( let found = vector.len(); let expected = *expected_dimensions.get_or_insert(found); if expected != found { - return Err(UserError::InvalidVectorDimensions { expected, found })?; + return Err(UserError::InvalidVectorDimensions { expected, found }.into()); } points.push(NDotProductPoint::new(vector)); @@ -398,6 +399,16 @@ fn merge_cbo_roaring_bitmaps( )?) } +/// A function that extracts and returns the Add side of a DelAdd obkv. +/// This is useful when there are no previous value in the database and +/// therefore we don't need to do a diff with what's already there. 
+/// +/// If there is no Add side we currently write an empty buffer +/// which is a valid CboRoaringBitmap. +fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Result<&'a [u8]> { + Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( From 560e8f56135f14e3a0be3bafccc917647bb87c98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 11:18:30 +0200 Subject: [PATCH 028/127] Introduce the CboRoaringBitmapCodec merge_deladd_into and use it --- .../cbo_roaring_bitmap_codec.rs | 23 ++++++++++ .../src/update/index_documents/typed_chunk.rs | 45 ++++++++++--------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 79b52695e..117da1308 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; use crate::heed_codec::BytesDecodeOwned; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -99,6 +100,28 @@ impl CboRoaringBitmapCodec { Ok(()) } + + /// Merges a DelAdd delta into a CboRoaringBitmap. 
+ pub fn merge_deladd_into( + deladd: KvReaderDelAdd<'_>, + previous: &[u8], + buffer: &mut Vec, + ) -> io::Result<()> { + // Deserialize the bitmap that is already there + let mut previous = Self::deserialize_from(previous)?; + + // Remove integers we no more want in the previous bitmap + if let Some(value) = deladd.get(DelAdd::Deletion) { + previous -= Self::deserialize_from(value)?; + } + + // Insert the new integers we want in the previous bitmap + if let Some(value) = deladd.get(DelAdd::Addition) { + previous |= Self::deserialize_from(value)?; + } + + previous.serialize_into(buffer) + } } impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e0e2ff1ec..faeee944f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -134,7 +134,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -153,7 +153,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -163,7 +163,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; @@ -173,7 +173,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; // create fst from word docids @@ -195,7 +195,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - 
merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -216,7 +216,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -227,7 +227,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -238,7 +238,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -249,7 +249,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -388,17 +388,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -fn merge_cbo_roaring_bitmaps( - new_value: &[u8], - db_value: &[u8], - buffer: &mut Vec, -) -> Result<()> { - Ok(CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], - buffer, - )?) -} - /// A function that extracts and returns the Add side of a DelAdd obkv. /// This is useful when there are no previous value in the database and /// therefore we don't need to do a diff with what's already there. @@ -409,6 +398,22 @@ fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Resul Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) } +/// A function that merges a DelAdd of bitmaps into an already existing bitmap. +/// +/// The first argument is the DelAdd obkv of CboRoaringBitmaps and +/// the second one is the CboRoaringBitmap to merge into. 
+fn merge_deladd_cbo_roaring_bitmaps( + deladd_obkv: &[u8], + previous: &[u8], + buffer: &mut Vec, +) -> Result<()> { + Ok(CboRoaringBitmapCodec::merge_deladd_into( + KvReaderDelAdd::new(deladd_obkv), + previous, + buffer, + )?) +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( From f67ff3a738374dc957a676707cb9f5214cb64629 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 19 Oct 2023 11:56:42 +0200 Subject: [PATCH 029/127] Facets Bulk update --- milli/src/update/facet/bulk.rs | 46 ++++++++++++++++++++++------------ milli/src/update/facet/mod.rs | 16 ++++++------ 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index a2b1c9dcd..40b64fc25 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -2,9 +2,10 @@ use std::borrow::Cow; use std::fs::File; use std::io::BufReader; -use grenad::CompressionType; +use grenad::{CompressionType, Reader}; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use obkv::KvReader; use roaring::RoaringBitmap; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; @@ -13,6 +14,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::heed_codec::ByteSliceRefCodec; +use crate::update::del_add::DelAdd; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -31,7 +33,7 @@ pub struct FacetsUpdateBulk<'i> { facet_type: FacetType, field_ids: Vec, // None if level 0 does not need to be updated - new_data: Option>>, + delta_data: Option>>, } impl<'i> FacetsUpdateBulk<'i> { @@ -39,7 +41,7 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, field_ids: Vec, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: 
grenad::Reader>, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { @@ -49,7 +51,7 @@ impl<'i> FacetsUpdateBulk<'i> { group_size, min_level_size, facet_type, - new_data: Some(new_data), + delta_data: Some(delta_data), } } @@ -64,13 +66,13 @@ impl<'i> FacetsUpdateBulk<'i> { group_size: FACET_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, - new_data: None, + delta_data: None, } } #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; let db = match facet_type { FacetType::String => index @@ -81,7 +83,7 @@ impl<'i> FacetsUpdateBulk<'i> { } }; - let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; + let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; @@ -95,7 +97,7 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, - pub new_data: Option>, + pub delta_data: Option>, pub group_size: u8, pub min_level_size: u8, } @@ -134,20 +136,26 @@ impl FacetsUpdateBulkInner { Ok(()) } - // TODO the new_data is an Reader>> fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { - let new_data = match self.new_data.take() { + let delta_data = match self.delta_data.take() { Some(x) => x, None => return Ok(()), }; if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); let mut database = self.db.iter_mut(wtxn)?.remap_types::(); - let mut cursor = new_data.into_cursor()?; + let mut cursor = delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ if !valid_lmdb_key(key) { continue; } + let value: KvReader = KvReader::new(value); + + // DB is empty, it is safe to ignore Del operations + let Some(value) = value.get(DelAdd::Addition) else { + continue; + }; + buffer.clear(); // the group size for level 0 buffer.push(1); @@ -159,11 +167,14 @@ impl FacetsUpdateBulkInner { let mut buffer = Vec::new(); let database = self.db.remap_types::(); - let mut cursor = new_data.into_cursor()?; + let mut cursor = delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { continue; } + + let value: KvReader = KvReader::new(value); + // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it buffer.clear(); @@ -172,12 +183,15 @@ impl FacetsUpdateBulkInner { match database.get(wtxn, key)? { Some(prev_value) => { let old_bitmap = &prev_value[1..]; - CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], - &mut buffer, - )?; + CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?; } None => { + // it is safe to ignore the del in that case. 
+ let Some(value) = value.get(DelAdd::Addition) else { + // won't put the key in DB as the value would be empty + continue; + }; + buffer.extend_from_slice(value); } }; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index decb6a9ac..c016af354 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -109,7 +109,7 @@ pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, max_group_size: u8, min_level_size: u8, @@ -119,7 +119,7 @@ impl<'i> FacetsUpdate<'i> { pub fn new( index: &'i Index, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, ) -> Self { let database = match facet_type { FacetType::String => index @@ -136,26 +136,26 @@ impl<'i> FacetsUpdate<'i> { max_group_size: FACET_MAX_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, - new_data, + delta_data, } } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - if self.new_data.is_empty() { + if self.delta_data.is_empty() { return Ok(()); } debug!("Computing and writing the facet values levels docids into LMDB on disk..."); self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // See self::comparison_bench::benchmark_facet_indexing - if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { + if self.delta_data.len() >= (self.database.len(wtxn)? 
as u64 / 50) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( self.index, field_ids, self.facet_type, - self.new_data, + self.delta_data, self.group_size, self.min_level_size, ); @@ -164,7 +164,7 @@ impl<'i> FacetsUpdate<'i> { let incremental_update = FacetsUpdateIncremental::new( self.index, self.facet_type, - self.new_data, + self.delta_data, self.group_size, self.min_level_size, self.max_group_size, @@ -464,7 +464,7 @@ pub(crate) mod test_helpers { let update = FacetsUpdateBulkInner { db: self.content, - new_data: Some(reader), + delta_data: Some(reader), group_size: self.group_size.get(), min_level_size: self.min_level_size.get(), }; From 04ec293024191deabe4725427230f78f01af72af Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 19 Oct 2023 12:01:12 +0200 Subject: [PATCH 030/127] Facet Incremental update --- milli/src/update/facet/incremental.rs | 77 +++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 11 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 743c0b038..802c02b85 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -4,6 +4,7 @@ use std::io::BufReader; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use obkv::KvReader; use roaring::RoaringBitmap; use crate::facet::FacetType; @@ -12,6 +13,7 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; +use crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -35,14 +37,14 @@ pub struct FacetsUpdateIncremental<'i> { index: &'i Index, inner: FacetsUpdateIncrementalInner, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, } impl<'i> FacetsUpdateIncremental<'i> { pub fn new( index: &'i 
Index, facet_type: FacetType, - new_data: grenad::Reader>, + delta_data: grenad::Reader>, group_size: u8, min_level_size: u8, max_group_size: u8, @@ -63,29 +65,82 @@ impl<'i> FacetsUpdateIncremental<'i> { min_level_size, }, facet_type, - new_data, + delta_data, } } pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { - let mut new_faceted_docids = HashMap::::default(); + #[derive(Default)] + struct DeltaDocids { + deleted: RoaringBitmap, + added: RoaringBitmap, + } + impl DeltaDocids { + fn add(&mut self, added: &RoaringBitmap) { + self.deleted -= added; + self.added |= added; + } + fn delete(&mut self, deleted: &RoaringBitmap) { + self.deleted |= deleted; + self.added -= deleted; + } + fn applied(self, mut docids: RoaringBitmap) -> RoaringBitmap { + docids -= self.deleted; + docids |= self.added; + docids + } + } - let mut cursor = self.new_data.into_cursor()?; + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ if !valid_lmdb_key(key) { continue; } let key = FacetGroupKeyCodec::::bytes_decode(key) .ok_or(heed::Error::Encoding)?; - let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; + let value = KvReader::new(value); + + let entry = new_faceted_docids.entry(key.field_id).or_default(); + + let docids_to_delete = value + .get(DelAdd::Deletion) + .map(CboRoaringBitmapCodec::bytes_decode) + .map(|o| o.ok_or(heed::Error::Encoding)); + + let docids_to_add = value + .get(DelAdd::Addition) + .map(CboRoaringBitmapCodec::bytes_decode) + .map(|o| o.ok_or(heed::Error::Encoding)); + + if let Some(docids_to_delete) = docids_to_delete { + let docids_to_delete = docids_to_delete?; + self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; + entry.delete(&docids_to_delete); + } + + if let Some(docids_to_add) = docids_to_add { + let docids_to_add = docids_to_add?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; + entry.add(&docids_to_add); + } } + // FIXME: broken for multi-value facets? + // + // Consider an incremental update: `facet="tags", facet_value="Action", {Del: Some([0, 1]), Add: None }` + // The current code will inconditionally remove docs 0 and 1 from faceted docs for "tags". 
+ // Now for doc 0: `"tags": "Action"`, it's correct behavior + // for doc 1: `"tags": "Action, Adventure"`, it's incorrect behavior for (field_id, new_docids) in new_faceted_docids { - let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - docids |= new_docids; - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + let old_docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + self.index.put_faceted_documents_ids( + wtxn, + field_id, + self.facet_type, + &new_docids.applied(old_docids), + )?; } Ok(()) } From 14832cb32414b28b10ac778bcf4794fccc748b67 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 23 Oct 2023 14:50:11 +0200 Subject: [PATCH 031/127] Remove Index::faceted_documents_ids --- milli/src/index.rs | 40 ---------------------- milli/src/snapshot_tests.rs | 30 ----------------- milli/src/update/clear_documents.rs | 16 --------- milli/src/update/delete_documents.rs | 6 ---- milli/src/update/facet/bulk.rs | 6 +--- milli/src/update/facet/delete.rs | 11 ------ milli/src/update/facet/incremental.rs | 45 ------------------------- milli/src/update/facet/mod.rs | 3 -- milli/src/update/index_documents/mod.rs | 18 ---------- 9 files changed, 1 insertion(+), 174 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 288223a95..f8be55545 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -55,7 +55,6 @@ pub mod main_key { /// e.g. vector-hnsw0x0032. 
pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; - pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; @@ -64,7 +63,6 @@ pub mod main_key { pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens"; pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens"; pub const DICTIONARY_KEY: &str = "dictionary"; - pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; pub const SYNONYMS_KEY: &str = "synonyms"; pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms"; pub const WORDS_FST_KEY: &str = "words-fst"; @@ -926,44 +924,6 @@ impl Index { /* faceted documents ids */ - /// Writes the documents ids that are faceted under this field id for the given facet type. - pub fn put_faceted_documents_ids( - &self, - wtxn: &mut RwTxn, - field_id: FieldId, - facet_type: FacetType, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - let key = match facet_type { - FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, - FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, - }; - let mut buffer = vec![0u8; key.len() + size_of::()]; - buffer[..key.len()].copy_from_slice(key.as_bytes()); - buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); - self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) - } - - /// Retrieve all the documents ids that are faceted under this field id for the given facet type. 
- pub fn faceted_documents_ids( - &self, - rtxn: &RoTxn, - field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result { - let key = match facet_type { - FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, - FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, - }; - let mut buffer = vec![0u8; key.len() + size_of::()]; - buffer[..key.len()].copy_from_slice(key.as_bytes()); - buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); - match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()), - } - } - /// Retrieve all the documents which contain this field id set as null pub fn null_faceted_documents_ids( &self, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 158f515b8..4b21cc175 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -359,31 +359,7 @@ pub fn snap_external_documents_ids(index: &Index) -> String { snap } -pub fn snap_number_faceted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let number_faceted_documents_ids = - index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); - writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) - .unwrap(); - } - snap -} -pub fn snap_string_faceted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let string_faceted_documents_ids = - index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); - writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) - .unwrap(); - } - snap -} pub fn snap_words_fst(index: &Index) -> String { let 
rtxn = index.read_txn().unwrap(); let words_fst = index.words_fst(&rtxn).unwrap(); @@ -531,12 +507,6 @@ macro_rules! full_snap_of_db { ($index:ident, external_documents_ids) => {{ $crate::snapshot_tests::snap_external_documents_ids(&$index) }}; - ($index:ident, number_faceted_documents_ids) => {{ - $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) - }}; - ($index:ident, string_faceted_documents_ids) => {{ - $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) - }}; ($index:ident, words_fst) => {{ $crate::snapshot_tests::snap_words_fst(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ab42fd854..52f3e80db 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -64,22 +64,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_vector_hnsw(self.wtxn)?; - // We clean all the faceted documents ids. - for field_id in faceted_fields { - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::Number, - &empty_roaring, - )?; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::String, - &empty_roaring, - )?; - } - // Clear the other databases. word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1fef922cd..9044f03be 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -384,12 +384,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { for facet_type in [FacetType::Number, FacetType::String] { let mut affected_facet_values = HashMap::new(); for field_id in self.index.faceted_fields_ids(self.wtxn)? 
{ - // Remove docids from the number faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; - let facet_values = remove_docids_from_field_id_docid_facet_value( self.index, self.wtxn, diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 40b64fc25..5247298a4 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -23,9 +23,6 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; /// /// First, the new elements are inserted into the level 0 of the database. Then, the /// higher levels are cleared and recomputed from the content of level 0. -/// -/// Finally, the `faceted_documents_ids` value in the main database of `Index` -/// is updated to contain the new set of faceted documents. pub struct FacetsUpdateBulk<'i> { index: &'i Index, group_size: u8, @@ -86,7 +83,7 @@ impl<'i> FacetsUpdateBulk<'i> { let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { - index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; + // TODO: remove the lambda altogether Ok(()) })?; @@ -507,7 +504,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); - db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); } #[test] diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 883abc8ca..8bd3f196b 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -160,7 +160,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); - db_snap!(index, number_faceted_documents_ids, 1, 
@"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); let mut wtxn = index.env.write_txn().unwrap(); @@ -178,7 +177,6 @@ mod tests { db_snap!(index, soft_deleted_documents_ids, @"[]"); db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); - db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); } // Same test as above but working with string values for the facets @@ -219,7 +217,6 @@ mod tests { // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); let mut wtxn = index.env.write_txn().unwrap(); @@ -237,7 +234,6 @@ mod tests { db_snap!(index, soft_deleted_documents_ids, @"[]"); db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); - db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); } #[test] @@ -274,7 +270,6 @@ mod tests { // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -291,12 +286,6 @@ mod tests { db_snap!(index, soft_deleted_documents_ids, @"[]"); db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); - db_snap!(index, string_faceted_documents_ids, 2, @r###" - 0 [] - 1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] - 2 [292, 324, 358, 381, 493, 839, 852, ] - 3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] - "###); } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 802c02b85..77e9874f6 100644 --- a/milli/src/update/facet/incremental.rs +++ 
b/milli/src/update/facet/incremental.rs @@ -30,9 +30,6 @@ enum DeletionResult { /// Algorithm to incrementally insert and delete elememts into the /// `facet_id_(string/f64)_docids` databases. -/// -/// Rhe `faceted_documents_ids` value in the main database of `Index` -/// is also updated to contain the new set of faceted documents. pub struct FacetsUpdateIncremental<'i> { index: &'i Index, inner: FacetsUpdateIncrementalInner, @@ -70,29 +67,6 @@ impl<'i> FacetsUpdateIncremental<'i> { } pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { - #[derive(Default)] - struct DeltaDocids { - deleted: RoaringBitmap, - added: RoaringBitmap, - } - impl DeltaDocids { - fn add(&mut self, added: &RoaringBitmap) { - self.deleted -= added; - self.added |= added; - } - fn delete(&mut self, deleted: &RoaringBitmap) { - self.deleted |= deleted; - self.added -= deleted; - } - fn applied(self, mut docids: RoaringBitmap) -> RoaringBitmap { - docids -= self.deleted; - docids |= self.added; - docids - } - } - - let mut new_faceted_docids = HashMap::::default(); - let mut cursor = self.delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { @@ -102,8 +76,6 @@ impl<'i> FacetsUpdateIncremental<'i> { .ok_or(heed::Error::Encoding)?; let value = KvReader::new(value); - let entry = new_faceted_docids.entry(key.field_id).or_default(); - let docids_to_delete = value .get(DelAdd::Deletion) .map(CboRoaringBitmapCodec::bytes_decode) @@ -117,31 +89,14 @@ impl<'i> FacetsUpdateIncremental<'i> { if let Some(docids_to_delete) = docids_to_delete { let docids_to_delete = docids_to_delete?; self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; - entry.delete(&docids_to_delete); } if let Some(docids_to_add) = docids_to_add { let docids_to_add = docids_to_add?; self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; - entry.add(&docids_to_add); } } - // FIXME: broken for multi-value facets? 
- // - // Consider an incremental update: `facet="tags", facet_value="Action", {Del: Some([0, 1]), Add: None }` - // The current code will inconditionally remove docs 0 and 1 from faceted docs for "tags". - // Now for doc 0: `"tags": "Action"`, it's correct behavior - // for doc 1: `"tags": "Action, Adventure"`, it's incorrect behavior - for (field_id, new_docids) in new_faceted_docids { - let old_docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - self.index.put_faceted_documents_ids( - wtxn, - field_id, - self.facet_type, - &new_docids.applied(old_docids), - )?; - } Ok(()) } } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index c016af354..e3c632983 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -599,7 +599,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); - db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); let mut documents = vec![]; @@ -622,7 +621,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); - db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); // Then replace the last document while disabling soft_deletion @@ -647,7 +645,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); - db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); } } diff --git a/milli/src/update/index_documents/mod.rs 
b/milli/src/update/index_documents/mod.rs index 703d7ee29..27021c3fb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1499,12 +1499,6 @@ mod tests { 3 2 second second 3 3 third third "###); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1528,12 +1522,6 @@ mod tests { db_snap!(index, facet_id_string_docids, @""); db_snap!(index, field_id_docid_facet_strings, @""); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); @@ -1560,12 +1548,6 @@ mod tests { 3 2 second second 3 3 third third "###); - db_snap!(index, string_faceted_documents_ids, @r###" - 0 [] - 1 [] - 2 [] - 3 [0, 1, 2, 3, ] - "###); let rtxn = index.read_txn().unwrap(); From 59f88c14b3087eb78324d36e679bb4f64799f277 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 23 Oct 2023 15:19:33 +0200 Subject: [PATCH 032/127] Simplify facet update after removing `Index::faceted_documents_ids` --- milli/src/index.rs | 2 -- milli/src/update/clear_documents.rs | 2 -- milli/src/update/facet/bulk.rs | 31 +++++-------------- milli/src/update/facet/incremental.rs | 15 +++------ milli/src/update/facet/mod.rs | 1 - .../src/update/index_documents/typed_chunk.rs | 5 +-- 6 files changed, 13 insertions(+), 43 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f8be55545..eb9e153ec 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::File; -use std::mem::size_of; use std::path::Path; use charabia::{Language, Script}; @@ -14,7 +13,6 @@ use time::OffsetDateTime; use crate::distance::NDotProductPoint; use crate::error::{InternalError, UserError}; -use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ 
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 52f3e80db..3eb7e0910 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,6 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::facet::FacetType; use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { @@ -51,7 +50,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We retrieve the number of documents ids that we are deleting. let number_of_documents = self.index.number_of_documents(self.wtxn)?; - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 5247298a4..d2205f9d6 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,8 +1,7 @@ -use std::borrow::Cow; use std::fs::File; use std::io::BufReader; -use grenad::{CompressionType, Reader}; +use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn, RwTxn}; use obkv::KvReader; @@ -82,10 +81,7 @@ impl<'i> FacetsUpdateBulk<'i> { let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; - inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { - // TODO: remove the lambda altogether - Ok(()) - })?; + inner.update(wtxn, &field_ids)?; Ok(()) } @@ -99,21 +95,14 @@ pub(crate) struct FacetsUpdateBulkInner { pub min_level_size: u8, } impl FacetsUpdateBulkInner { - pub fn update( - mut self, - wtxn: &mut RwTxn, - field_ids: &[u16], - mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, - ) -> Result<()> { + pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> { 
self.update_level0(wtxn)?; for &field_id in field_ids.iter() { self.clear_levels(wtxn, field_id)?; } for &field_id in field_ids.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; - - handle_all_docids(wtxn, field_id, all_docids)?; + let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; @@ -201,16 +190,10 @@ impl FacetsUpdateBulkInner { &self, field_id: FieldId, txn: &RoTxn, - ) -> Result<(Vec>>, RoaringBitmap)> { - let mut all_docids = RoaringBitmap::new(); - let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { - for bitmap in bitmaps { - all_docids |= bitmap; - } - Ok(()) - })?; + ) -> Result>>> { + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?; - Ok((subwriters, all_docids)) + Ok(subwriters) } #[allow(clippy::type_complexity)] fn read_level_0<'t>( diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 77e9874f6..e241c499c 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::fs::File; use std::io::BufReader; @@ -15,7 +14,7 @@ use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{CboRoaringBitmapCodec, Index, Result}; enum InsertionResult { InPlace, @@ -30,16 +29,14 @@ enum DeletionResult { /// Algorithm to incrementally insert and delete elememts into the /// `facet_id_(string/f64)_docids` databases. 
-pub struct FacetsUpdateIncremental<'i> { - index: &'i Index, +pub struct FacetsUpdateIncremental { inner: FacetsUpdateIncrementalInner, - facet_type: FacetType, delta_data: grenad::Reader>, } -impl<'i> FacetsUpdateIncremental<'i> { +impl FacetsUpdateIncremental { pub fn new( - index: &'i Index, + index: &Index, facet_type: FacetType, delta_data: grenad::Reader>, group_size: u8, @@ -47,7 +44,6 @@ impl<'i> FacetsUpdateIncremental<'i> { max_group_size: u8, ) -> Self { FacetsUpdateIncremental { - index, inner: FacetsUpdateIncrementalInner { db: match facet_type { FacetType::String => index @@ -61,12 +57,11 @@ impl<'i> FacetsUpdateIncremental<'i> { max_group_size, min_level_size, }, - facet_type, delta_data, } } - pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { + pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { let mut cursor = self.delta_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if !valid_lmdb_key(key) { diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index e3c632983..3465e5437 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -115,7 +115,6 @@ pub struct FacetsUpdate<'i> { min_level_size: u8, } impl<'i> FacetsUpdate<'i> { - // TODO grenad::Reader> pub fn new( index: &'i Index, facet_type: FacetType, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index faeee944f..0d618ad28 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; @@ -11,9 +10,7 @@ use heed::types::ByteSlice; use heed::RwTxn; use roaring::RoaringBitmap; -use super::helpers::{ - self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, -}; +use super::helpers::{self, merge_ignore_values, valid_lmdb_key, 
CursorClonableMmap}; use super::{ClonableMmap, MergeFn}; use crate::distance::NDotProductPoint; use crate::error::UserError; From 66abac9364265da9896a33c24bc76249bb063bcb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 23 Oct 2023 15:55:35 +0200 Subject: [PATCH 033/127] Use specialized `KvReaderDelAdd` type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/update/facet/bulk.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index d2205f9d6..297d189cd 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -4,7 +4,6 @@ use std::io::BufReader; use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use obkv::KvReader; use roaring::RoaringBitmap; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; @@ -13,7 +12,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::heed_codec::ByteSliceRefCodec; -use crate::update::del_add::DelAdd; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -135,7 +134,7 @@ impl FacetsUpdateBulkInner { if !valid_lmdb_key(key) { continue; } - let value: KvReader = KvReader::new(value); + let value = KvReaderDelAdd::new(value); // DB is empty, it is safe to ignore Del operations let Some(value) = value.get(DelAdd::Addition) else { @@ -159,7 +158,7 @@ impl FacetsUpdateBulkInner { continue; } - let value: KvReader = KvReader::new(value); + let value = KvReaderDelAdd::new(value); // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it From b26dc9aabe774812dedffb96c3efeb0dfd4252dc Mon Sep 17 00:00:00 2001 From: Louis Dureuil 
Date: Mon, 23 Oct 2023 16:06:06 +0200 Subject: [PATCH 034/127] Explanatory code comment --- milli/src/update/facet/bulk.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 297d189cd..c0b159e57 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -167,6 +167,7 @@ impl FacetsUpdateBulkInner { // then we extend the buffer with the docids bitmap match database.get(wtxn, key)? { Some(prev_value) => { + // prev_value is the group size for level 0, followed by the previous bitmap. let old_bitmap = &prev_value[1..]; CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?; } From ba90a5ec0eb11ab99ab933f7fb65930bec93cc6d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 23 Oct 2023 16:34:49 +0200 Subject: [PATCH 035/127] update extract fid word count docids --- .../extract/extract_fid_word_count_docids.rs | 51 ++++++++++++++++--- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 289a744da..accf4a510 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -4,11 +4,12 @@ use std::io::{self, BufReader}; use obkv::KvReaderU16; use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, + create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, GrenadParameters, }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::Result; const MAX_COUNTED_WORDS: usize = 30; @@ -29,7 +30,7 @@ pub fn extract_fid_word_count_docids( let mut fid_word_count_docids_sorter = create_sorter( 
grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -37,18 +38,52 @@ pub fn extract_fid_word_count_docids( ); let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { let (document_id_bytes, fid_bytes) = try_split_array_at(key) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count(); - if word_count <= MAX_COUNTED_WORDS { - key_buffer.clear(); - key_buffer.extend_from_slice(fid_bytes); - key_buffer.push(word_count as u8); - fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + let del_add_reader = KvReaderDelAdd::new(&value); + let deletion = del_add_reader + // get deleted words + .get(DelAdd::Deletion) + // count deleted words + .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()) + // keep the count if under or equal to MAX_COUNTED_WORDS + .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); + let addition = del_add_reader + // get added words + .get(DelAdd::Addition) + // count added words + .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count()) + // keep the count if under or equal to MAX_COUNTED_WORDS + .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); + + if deletion != addition { + // Insert deleted word count in sorter if exist. 
+ if let Some(word_count) = deletion { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); + key_buffer.clear(); + key_buffer.extend_from_slice(fid_bytes); + key_buffer.push(word_count as u8); + fid_word_count_docids_sorter + .insert(&key_buffer, value_writer.into_inner().unwrap())?; + } + // Insert added word count in sorter if exist. + if let Some(word_count) = addition { + value_buffer.clear(); + let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); + key_buffer.clear(); + key_buffer.extend_from_slice(fid_bytes); + key_buffer.push(word_count as u8); + fid_word_count_docids_sorter + .insert(&key_buffer, value_writer.into_inner().unwrap())?; + } } } From a3dae4db9beb546517160aab8d896f8f2d12d2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 15:55:48 +0200 Subject: [PATCH 036/127] Extract the geo fields DelAdd and generate a new DelAdd obkv with it --- .../extract/extract_geo_points.rs | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 285a4bdba..36be9b5b6 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -6,6 +6,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::extract_finite_float_from_value; use crate::{FieldId, InternalError, Result}; @@ -14,6 +15,7 @@ use crate::{FieldId, InternalError, Result}; /// Returns the generated grenad reader containing the docid as key associated 
to the (latitude, longitude) #[logging_timer::time] pub fn extract_geo_points( + // TODO grenad::Reader>> obkv_documents: grenad::Reader, indexer: GrenadParameters, primary_key_id: FieldId, @@ -30,39 +32,72 @@ pub fn extract_geo_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::new(value); - // since we only needs the primary key when we throw an error we create this getter to - // lazily get it when needed + // since we only need the primary key when we throw an error + // we create this getter to lazily get it when needed let document_id = || -> Value { let document_id = obkv.get(primary_key_id).unwrap(); serde_json::from_slice(document_id).unwrap() }; + // HELP we will receive two DelAdds here, one for the lat and one for the lng + // what happens if there is a missing Del or Add for one of them? + // first we get the two fields - let lat = obkv.get(lat_fid); - let lng = obkv.get(lng_fid); + match (obkv.get(lat_fid), obkv.get(lng_fid)) { + (Some(lat), Some(lng)) => { + let deladd_lat_obkv = KvReaderDelAdd::new(lat); + let deladd_lng_obkv = KvReaderDelAdd::new(lng); - if let Some((lat, lng)) = lat.zip(lng) { - // then we extract the values - let lat = extract_finite_float_from_value( - serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, - ) - .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; + // then we extract the values + let del_lat_lng = deladd_lat_obkv + .get(DelAdd::Deletion) + .zip(deladd_lng_obkv.get(DelAdd::Deletion)) + .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) + .transpose()?; + let add_lat_lng = deladd_lat_obkv + .get(DelAdd::Addition) + .zip(deladd_lng_obkv.get(DelAdd::Addition)) + .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) + .transpose()?; - let lng = extract_finite_float_from_value( - serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, - ) - .map_err(|lng| 
GeoError::BadLongitude { document_id: document_id(), value: lng })?; - - #[allow(clippy::drop_non_drop)] - let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; - writer.insert(docid_bytes, bytes)?; - } else if lat.is_none() && lng.is_some() { - return Err(GeoError::MissingLatitude { document_id: document_id() })?; - } else if lat.is_some() && lng.is_none() { - return Err(GeoError::MissingLongitude { document_id: document_id() })?; + let mut obkv = KvWriterDelAdd::memory(); + if let Some([lat, lng]) = del_lat_lng { + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + obkv.insert(DelAdd::Deletion, bytes)?; + } + if let Some([lat, lng]) = add_lat_lng { + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + obkv.insert(DelAdd::Addition, bytes)?; + } + let bytes = obkv.into_inner()?; + writer.insert(docid_bytes, bytes)?; + } + (None, Some(_)) => { + return Err(GeoError::MissingLatitude { document_id: document_id() }.into()) + } + (Some(_), None) => { + return Err(GeoError::MissingLongitude { document_id: document_id() }.into()) + } + (None, None) => (), } - // else => the _geo object was `null`, there is nothing to do } writer_into_reader(writer) } + +/// Extract the finite floats lat and lng from two bytes slices. 
+fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> { + let lat = extract_finite_float_from_value( + serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; + + let lng = extract_finite_float_from_value( + serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; + + Ok([lat, lng]) +} From 544440c363c843da5eb2832a30185480f28b5b07 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 23 Oct 2023 11:54:45 +0200 Subject: [PATCH 037/127] Ignore geo fields when the Del and Add content is the same --- .../extract/extract_geo_points.rs | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 36be9b5b6..a818bb91c 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -60,19 +60,21 @@ pub fn extract_geo_points( .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) .transpose()?; - let mut obkv = KvWriterDelAdd::memory(); - if let Some([lat, lng]) = del_lat_lng { - #[allow(clippy::drop_non_drop)] - let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; - obkv.insert(DelAdd::Deletion, bytes)?; + if del_lat_lng != add_lat_lng { + let mut obkv = KvWriterDelAdd::memory(); + if let Some([lat, lng]) = del_lat_lng { + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + obkv.insert(DelAdd::Deletion, bytes)?; + } + if let Some([lat, lng]) = add_lat_lng { + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + obkv.insert(DelAdd::Addition, bytes)?; + } + let 
bytes = obkv.into_inner()?; + writer.insert(docid_bytes, bytes)?; } - if let Some([lat, lng]) = add_lat_lng { - #[allow(clippy::drop_non_drop)] - let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; - obkv.insert(DelAdd::Addition, bytes)?; - } - let bytes = obkv.into_inner()?; - writer.insert(docid_bytes, bytes)?; } (None, Some(_)) => { return Err(GeoError::MissingLatitude { document_id: document_id() }.into()) From 77dcbff6b2355b6a72837c29cec4dc7c355dea22 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 23 Oct 2023 13:49:54 +0200 Subject: [PATCH 038/127] Remove and Insert the DelAdd geo points --- .../extract/extract_geo_points.rs | 1 - .../src/update/index_documents/typed_chunk.rs | 28 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index a818bb91c..cc283121e 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -15,7 +15,6 @@ use crate::{FieldId, InternalError, Result}; /// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude) #[logging_timer::time] pub fn extract_geo_points( - // TODO grenad::Reader>> obkv_documents: grenad::Reader, indexer: GrenadParameters, primary_key_id: FieldId, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 0d618ad28..9d4d63f90 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -279,14 +279,17 @@ pub(crate) fn write_typed_chunk_into_index( // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - // convert the latitude and longitude back to a f64 (8 bytes) - let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); 
- let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); - let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; - let xyz_point = lat_lng_to_xyz(&point); - - rtree.insert(GeoPoint::new(xyz_point, (docid, point))); - geo_faceted_docids.insert(docid); + let deladd_obkv = KvReaderDelAdd::new(value); + if let Some(value) = deladd_obkv.get(DelAdd::Deletion) { + let geopoint = extract_geo_point(value, docid); + rtree.remove(&geopoint); + geo_faceted_docids.remove(docid); + } + if let Some(value) = deladd_obkv.get(DelAdd::Addition) { + let geopoint = extract_geo_point(value, docid); + rtree.insert(geopoint); + geo_faceted_docids.insert(docid); + } } index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; @@ -368,6 +371,15 @@ pub(crate) fn write_typed_chunk_into_index( Ok((RoaringBitmap::new(), is_merged_database)) } +/// Converts the latitude and longitude back to an xyz GeoPoint. +fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { + let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); + let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); + let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; + let xyz_point = lat_lng_to_xyz(&point); + GeoPoint::new(xyz_point, (docid, point)) +} + fn merge_word_docids_reader_into_fst( word_docids_iter: grenad::Reader>, exact_word_docids_iter: grenad::Reader>, From 576fa9c6da0567e73b9598b5ca51f76a1bfd2c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 24 Oct 2023 10:21:47 +0200 Subject: [PATCH 039/127] Remove useless comment --- milli/src/update/index_documents/extract/extract_geo_points.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index cc283121e..5ee7967d2 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ 
b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -38,9 +38,6 @@ pub fn extract_geo_points( serde_json::from_slice(document_id).unwrap() }; - // HELP we will receive two DelAdds here, one for the lat and one for the lng - // what happens if there is a missing Del or Add for one of them? - // first we get the two fields match (obkv.get(lat_fid), obkv.get(lng_fid)) { (Some(lat), Some(lng)) => { From 476e4d3dbed3ccf91c3bb95249a557b92f035562 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 24 Oct 2023 10:19:32 +0200 Subject: [PATCH 040/127] Use value buffer instead of the initial value when writing the final result in the sorter --- .../extract/extract_docid_word_positions.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 36258b275..e5d95cbdb 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -115,6 +115,7 @@ pub fn extract_docid_word_positions( let (add_obkv, add_script_language_word_count) = add?; // merge deletions and additions. + // transforming two KV> into one KV>> value_buffer.clear(); del_add_from_two_obkvs( KvReader::::new(del_obkv), @@ -122,8 +123,8 @@ pub fn extract_docid_word_positions( &mut value_buffer, )?; - // write them into the sorter. - let obkv = KvReader::::new(value); + // write each KV> into the sorter, field by field. + let obkv = KvReader::::new(&value_buffer); for (field_id, value) in obkv.iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&field_id.to_be_bytes()); @@ -151,6 +152,7 @@ pub fn extract_docid_word_positions( } } + // the returned sorter is serialized as: key: (DocId, FieldId), value: KV>.
sorter_into_reader(docid_word_positions_sorter, indexer) .map(|reader| (documents_ids, reader, script_language_docids)) } @@ -266,6 +268,7 @@ fn lang_safe_tokens_from_document<'a>( } } + // returns a (KV>, HashMap>) Ok((&buffers.obkv_buffer, script_language_word_count)) } @@ -331,6 +334,7 @@ fn tokens_from_document<'a>( } } + // returns a KV> Ok(document_writer.into_inner().map(|v| v.as_slice())?) } From 696fcf4d185793f2ffaa2274dc45700128e06dd2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 11:03:35 +0200 Subject: [PATCH 041/127] Fix document insertion into LMDB --- .../src/update/index_documents/typed_chunk.rs | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 9d4d63f90..6a2ea8486 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -8,6 +8,7 @@ use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::RwTxn; +use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap}; @@ -19,7 +20,9 @@ use crate::index::Hnsw; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; -use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; +use crate::{ + lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result, BEU32, +}; pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), @@ -120,8 +123,20 @@ pub(crate) fn write_typed_chunk_into_index( match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? 
{ - index.documents.remap_types::().put(wtxn, key, value)?; + while let Some((docid, reader)) = cursor.move_on_next()? { + let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); + let reader: KvReader = KvReader::new(reader); + for (field_id, value) in reader.iter() { + let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else { + continue; + }; + writer.insert(field_id, value)?; + } + index.documents.remap_types::().put( + wtxn, + docid, + &writer.into_inner().unwrap(), + )?; } } TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { From cda6ca1ee6880ebfaaf53a4c969e6a950b1d56c4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 14:26:14 +0200 Subject: [PATCH 042/127] Remove TypedChunk::NewDocumentIds --- milli/src/update/index_documents/extract/mod.rs | 3 --- milli/src/update/index_documents/typed_chunk.rs | 7 ------- 2 files changed, 10 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 7d643d61f..20ee38c4f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -366,9 +366,6 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes, )?; - // send documents_ids to DB writer - let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); - // send docid_word_positions_chunk to DB writer let docid_word_positions_chunk = unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? 
}; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 6a2ea8486..aebfca151 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -29,7 +29,6 @@ pub(crate) enum TypedChunk { FieldIdDocidFacetNumbers(grenad::Reader), Documents(grenad::Reader), FieldIdWordCountDocids(grenad::Reader>), - NewDocumentsIds(RoaringBitmap), WordDocids { word_docids_reader: grenad::Reader>, exact_word_docids_reader: grenad::Reader>, @@ -62,9 +61,6 @@ impl TypedChunk { TypedChunk::FieldIdWordCountDocids(grenad) => { format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::NewDocumentsIds(grenad) => { - format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len()) - } TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader, @@ -150,9 +146,6 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::NewDocumentsIds(documents_ids) => { - return Ok((documents_ids, is_merged_database)) - } TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader, From 946c762d289f4ca468f243226ca2a61f718599ec Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 14:26:49 +0200 Subject: [PATCH 043/127] WIP: reset documents in TypedChunk::Documents --- milli/src/update/index_documents/mod.rs | 17 +++--------- .../src/update/index_documents/typed_chunk.rs | 26 +++++++++++++++---- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 27021c3fb..d1fa28826 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,7 +35,7 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, 
DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{CboRoaringBitmapCodec, Index, Result}; @@ -374,17 +374,6 @@ where drop(lmdb_writer_sx) }); - // We delete the documents that this document addition replaces. This way we are - // able to simply insert all the documents even if they already exist in the database. - if !replaced_documents_ids.is_empty() { - let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; - deletion_builder.strategy(self.config.deletion_strategy); - debug!("documents to delete {:?}", replaced_documents_ids); - deletion_builder.delete_documents(&replaced_documents_ids); - let deleted_documents_result = deletion_builder.execute_inner()?; - debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); - } - let index_documents_ids = self.index.documents_ids(self.wtxn)?; let index_is_empty = index_documents_ids.is_empty(); let mut final_documents_ids = RoaringBitmap::new(); @@ -437,6 +426,7 @@ where otherwise => otherwise, }; + // FIXME: return newly added as well as newly deleted documents let (docids, is_merged_database) = write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { @@ -472,8 +462,9 @@ where let external_documents_ids = external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; + // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids` let all_documents_ids = index_documents_ids | new_documents_ids; - self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; + //self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; // TODO: reactivate prefix DB with diff-indexing // self.execute_prefix_databases( diff --git a/milli/src/update/index_documents/typed_chunk.rs 
b/milli/src/update/index_documents/typed_chunk.rs index aebfca151..39537cce7 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -118,22 +118,38 @@ pub(crate) fn write_typed_chunk_into_index( let mut is_merged_database = false; match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { + let mut docids = index.documents_ids(wtxn)?; + let mut cursor = obkv_documents_iter.into_cursor()?; while let Some((docid, reader)) = cursor.move_on_next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); + let mut written = false; for (field_id, value) in reader.iter() { let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else { continue; }; + // TODO: writer.is_empty + written = true; writer.insert(field_id, value)?; } - index.documents.remap_types::().put( - wtxn, - docid, - &writer.into_inner().unwrap(), - )?; + + let db = index.documents.remap_data_type::(); + let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap(); + + if written { + db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?; + docids.insert(docid); + } else { + db.delete(wtxn, &BEU32::new(docid))?; + // FIXME: unwrap + if !docids.remove(docid) { + panic!("Attempt to remove a document id that doesn't exist") + } + } } + + index.put_documents_ids(wtxn, &docids)?; } TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { append_entries_into_database( From 5be569e3e2799721a83df877921fde972848e933 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 17:01:30 +0200 Subject: [PATCH 044/127] Update obkv --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d8cd12cc2..2ab2f706a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2866,9 +2866,9 @@ dependencies = [ [[package]] name = "obkv" -version = "0.2.0" +version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" +checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080" [[package]] name = "once_cell" From 8fb221dae36ed73475a83fb511775f9b8729e36d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 17:01:45 +0200 Subject: [PATCH 045/127] Refactor ExternalDocumentsIds - Remove soft deleted - Add apply method that takes a list of operations to encapsulate modifications to the external -> internal mapping --- milli/src/external_documents_ids.rs | 189 +++++++++++++--------------- 1 file changed, 85 insertions(+), 104 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 36b147336..cd6a7e729 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -7,133 +7,118 @@ use fst::map::IndexedValue; use fst::{IntoStreamer, Streamer}; use roaring::RoaringBitmap; +use crate::DocumentId; + const DELETED_ID: u64 = u64::MAX; -pub struct ExternalDocumentsIds<'a> { - pub(crate) hard: fst::Map>, - pub(crate) soft: fst::Map>, - soft_deleted_docids: RoaringBitmap, +pub enum DocumentOperationKind { + Create, + Delete, } +pub struct DocumentOperation { + pub external_id: String, + pub internal_id: DocumentId, + pub kind: DocumentOperationKind, +} + +pub struct ExternalDocumentsIds<'a>(fst::Map>); + impl<'a> ExternalDocumentsIds<'a> { - pub fn new( - hard: fst::Map>, - soft: fst::Map>, - soft_deleted_docids: RoaringBitmap, - ) -> ExternalDocumentsIds<'a> { - ExternalDocumentsIds { hard, soft, soft_deleted_docids } + pub fn new(fst: fst::Map>) -> ExternalDocumentsIds<'a> { + ExternalDocumentsIds(fst) } pub fn into_static(self) -> ExternalDocumentsIds<'static> { - ExternalDocumentsIds { - hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), - soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), - soft_deleted_docids: 
self.soft_deleted_docids, - } + ExternalDocumentsIds(self.0.map_data(|c| Cow::Owned(c.into_owned())).unwrap()) } /// Returns `true` if hard and soft external documents lists are empty. pub fn is_empty(&self) -> bool { - self.hard.is_empty() && self.soft.is_empty() + self.0.is_empty() } pub fn get>(&self, external_id: A) -> Option { let external_id = external_id.as_ref(); - match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { - Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { - Some(id.try_into().unwrap()) - } - _otherwise => None, - } - } - - /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they - /// don't contain any soft deleted document id. - pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { - let mut new_hard_builder = fst::MapBuilder::memory(); - - let union_op = self.hard.op().add(&self.soft).r#union(); - let mut iter = union_op.into_stream(); - while let Some((external_id, docids)) = iter.next() { - // prefer selecting the ids from soft, always - let id = indexed_last_value(docids).unwrap(); - if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { - new_hard_builder.insert(external_id, id)?; - } - } - drop(iter); - - // Delete soft map completely - self.soft = fst::Map::default().map_data(Cow::Owned)?; - // We save the new map as the new hard map. - self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; - - Ok(()) - } - - pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { - let union_op = self.soft.op().add(other).r#union(); - - let mut new_soft_builder = fst::MapBuilder::memory(); - let mut iter = union_op.into_stream(); - while let Some((external_id, marked_docids)) = iter.next() { - let id = indexed_last_value(marked_docids).unwrap(); - new_soft_builder.insert(external_id, id)?; - } - - drop(iter); - - // We save the new map as the new soft map. 
- self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; - self.merge_soft_into_hard() + self.0.get(external_id).map(|x| x.try_into().unwrap()) } /// An helper function to debug this type, returns an `HashMap` of both, /// soft and hard fst maps, combined. pub fn to_hash_map(&self) -> HashMap { - let mut map = HashMap::new(); - - let union_op = self.hard.op().add(&self.soft).r#union(); - let mut iter = union_op.into_stream(); - while let Some((external_id, marked_docids)) = iter.next() { - let id = indexed_last_value(marked_docids).unwrap(); - if id != DELETED_ID { - let external_id = str::from_utf8(external_id).unwrap(); - map.insert(external_id.to_owned(), id.try_into().unwrap()); - } + let mut map = HashMap::default(); + let mut stream = self.0.stream(); + while let Some((k, v)) = stream.next() { + let k = String::from_utf8(k.to_vec()).unwrap(); + map.insert(k, v.try_into().unwrap()); } - map } - /// Return an fst of the combined hard and soft deleted ID. - pub fn to_fst<'b>(&'b self) -> fst::Result>>> { - if self.soft.is_empty() { - return Ok(Cow::Borrowed(&self.hard)); - } - let union_op = self.hard.op().add(&self.soft).r#union(); - - let mut iter = union_op.into_stream(); - let mut new_hard_builder = fst::MapBuilder::memory(); - while let Some((external_id, marked_docids)) = iter.next() { - let value = indexed_last_value(marked_docids).unwrap(); - if value != DELETED_ID { - new_hard_builder.insert(external_id, value)?; - } - } - - drop(iter); - - Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?)) + pub fn as_bytes(&self) -> &[u8] { + self.0.as_fst().as_bytes() } - fn merge_soft_into_hard(&mut self) -> fst::Result<()> { - if self.soft.len() >= self.hard.len() / 2 { - self.hard = self.to_fst()?.into_owned(); - self.soft = fst::Map::default().map_data(Cow::Owned)?; - } + /// Apply the list of operations passed as argument, modifying the current external to internal id mapping. 
+ /// + /// If the list contains multiple operations on the same external id, then the result is unspecified. + /// + /// # Panics + /// + /// - If attempting to delete a document that doesn't exist + /// - If attempting to create a document that already exists + pub fn apply(&mut self, mut operations: Vec) { + operations.sort_unstable_by(|left, right| left.external_id.cmp(&right.external_id)); + operations.dedup_by(|left, right| left.external_id == right.external_id); - Ok(()) + let mut builder = fst::MapBuilder::memory(); + + let mut stream = self.0.stream(); + let mut next_stream = stream.next(); + let mut operations = operations.iter(); + let mut next_operation = operations.next(); + + loop { + (next_stream, next_operation) = match (next_stream.take(), next_operation.take()) { + (None, None) => break, + (None, Some(DocumentOperation { external_id, internal_id, kind })) => { + if matches!(kind, DocumentOperationKind::Delete) { + panic!("Attempting to delete a non-existing document") + } + builder.insert(external_id, (*internal_id).into()).unwrap(); + (None, operations.next()) + } + (Some((k, v)), None) => { + builder.insert(k, v).unwrap(); + (stream.next(), None) + } + ( + current_stream @ Some((left_external_id, left_internal_id)), + current_operation @ Some(DocumentOperation { + external_id: right_external_id, + internal_id: right_internal_id, + kind, + }), + ) => match left_external_id.cmp(right_external_id.as_bytes()) { + std::cmp::Ordering::Less => { + builder.insert(left_external_id, left_internal_id).unwrap(); + (stream.next(), current_operation) + } + std::cmp::Ordering::Greater => { + builder.insert(right_external_id, (*right_internal_id).into()).unwrap(); + (current_stream, operations.next()) + } + std::cmp::Ordering::Equal => { + if matches!(kind, DocumentOperationKind::Create) { + panic!("Attempting to create an already-existing document"); + } + // we delete the document, so we just advance both iterators to skip in stream + (stream.next(), 
operations.next()) + } + }, + } + } + self.0 = builder.into_map().map_data(Cow::Owned).unwrap(); } } @@ -145,11 +130,7 @@ impl fmt::Debug for ExternalDocumentsIds<'_> { impl Default for ExternalDocumentsIds<'static> { fn default() -> Self { - ExternalDocumentsIds { - hard: fst::Map::default().map_data(Cow::Owned).unwrap(), - soft: fst::Map::default().map_data(Cow::Owned).unwrap(), - soft_deleted_docids: RoaringBitmap::new(), - } + ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap()) } } From bafeb892a770fc6d5482044610705ce65b9174bf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 17:02:55 +0200 Subject: [PATCH 046/127] Modify Index after changes to ExternalDocumentsIds --- milli/src/index.rs | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index eb9e153ec..61ec41788 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -52,11 +52,10 @@ pub mod main_key { /// It is concatenated with a big-endian encoded number (non-human readable). /// e.g. vector-hnsw0x0032. pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; - pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; + pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; - pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const STOP_WORDS_KEY: &str = "stop-words"; pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens"; pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens"; @@ -417,18 +416,10 @@ impl Index { wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'_>, ) -> heed::Result<()> { - let ExternalDocumentsIds { hard, soft, .. 
} = external_documents_ids; - let hard = hard.as_fst().as_bytes(); - let soft = soft.as_fst().as_bytes(); self.main.put::<_, Str, ByteSlice>( wtxn, - main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, - hard, - )?; - self.main.put::<_, Str, ByteSlice>( - wtxn, - main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, - soft, + main_key::EXTERNAL_DOCUMENTS_IDS_KEY, + external_documents_ids.as_bytes(), )?; Ok(()) } @@ -436,20 +427,12 @@ impl Index { /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let hard = - self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = - self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let hard = match hard { - Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, + let fst = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::EXTERNAL_DOCUMENTS_IDS_KEY)?; + let fst = match fst { + Some(fst) => fst::Map::new(fst)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, }; - let soft = match soft { - Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, - None => fst::Map::default().map_data(Cow::Owned)?, - }; - let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; - Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids)) + Ok(ExternalDocumentsIds::new(fst)) } /* fields ids map */ From c6b3c18c85e234929f517a2dbfd7dfcd01e71c36 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 17:04:02 +0200 Subject: [PATCH 047/127] WIP: Comment out document deletion in other pipelines than update TODO: fix calls to DELETE route --- milli/src/update/delete_documents.rs | 10 ++++++---- milli/src/update/index_documents/mod.rs | 8 ++++---- milli/src/update/index_documents/transform.rs | 12 +++--------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git 
a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 9044f03be..0299e1e4f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -255,12 +255,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We acquire the current external documents ids map... // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` - let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + //let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; // We then remove the soft-deleted docids from it - new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + //new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; // and write it back to the main database. - let new_external_documents_ids = new_external_documents_ids.into_static(); - self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; + //let new_external_documents_ids = new_external_documents_ids.into_static(); + //self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; + + todo!("please autobatch deletions for now"); let mut words_to_keep = BTreeSet::default(); let mut words_to_delete = BTreeSet::default(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d1fa28826..8d187a89d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -457,10 +457,10 @@ where self.index.put_primary_key(self.wtxn, &primary_key)?; // We write the external documents ids into the main database. 
- let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - external_documents_ids.insert_ids(&new_external_documents_ids)?; - let external_documents_ids = external_documents_ids.into_static(); - self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; + //let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + //external_documents_ids.insert_ids(&new_external_documents_ids)?; + //let external_documents_ids = external_documents_ids.into_static(); + //self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids` let all_documents_ids = index_documents_ids | new_documents_ids; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 2b77768cb..e02da8cb5 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -763,14 +763,6 @@ impl<'a, 'i> Transform<'a, 'i> { .to_string(); let field_distribution = self.index.field_distribution(wtxn)?; - // Delete the soft deleted document ids from the maps inside the external_document_ids structure - let new_external_documents_ids = { - let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; - external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // This call should be free and can't fail since the previous method merged both fsts. 
- external_documents_ids.into_static().to_fst()?.into_owned() - }; - let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; @@ -858,8 +850,10 @@ impl<'a, 'i> Transform<'a, 'i> { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - new_external_documents_ids, + // FIXME: remove this now unused field + new_external_documents_ids: fst::Map::default().map_data(Cow::Owned).unwrap(), new_documents_ids: documents_ids, + // FIXME: remove this now unused field replaced_documents_ids: RoaringBitmap::default(), documents_count, original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, From 85f42fbc036e850cf55b044eb948de72abbf5ebe Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 17:04:48 +0200 Subject: [PATCH 048/127] Handle external to internal id mapping from TypedChunk::Documents --- .../src/update/index_documents/typed_chunk.rs | 81 +++++++++++++++---- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 39537cce7..1f1ac4adf 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -15,13 +15,16 @@ use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMm use super::{ClonableMmap, MergeFn}; use crate::distance::NDotProductPoint; use crate::error::UserError; +use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::Hnsw; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; +use crate::update::index_documents::validate_document_id_value; use crate::{ - lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result, BEU32, + lat_lng_to_xyz, 
CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, + Result, BEU32, }; pub(crate) enum TypedChunk { @@ -118,36 +121,82 @@ pub(crate) fn write_typed_chunk_into_index( let mut is_merged_database = false; match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { - let mut docids = index.documents_ids(wtxn)?; + let mut operations: Vec = Default::default(); + let mut docids = index.documents_ids(wtxn)?; + let primary_key = index.primary_key(wtxn)?.unwrap(); + let primary_key = index.fields_ids_map(wtxn)?.id(primary_key).unwrap(); let mut cursor = obkv_documents_iter.into_cursor()?; while let Some((docid, reader)) = cursor.move_on_next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); - let mut written = false; + let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap(); + for (field_id, value) in reader.iter() { - let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else { - continue; - }; - // TODO: writer.is_empty - written = true; - writer.insert(field_id, value)?; + let del_add_reader = KvReaderDelAdd::new(value); + match ( + del_add_reader.get(DelAdd::Deletion), + del_add_reader.get(DelAdd::Addition), + ) { + (None, None) => {} + (None, Some(value)) => { + // if primary key, new document + if field_id == primary_key { + // FIXME: we already extracted the external docid before. 
We should retrieve it in the typed chunk + // rather than re-extract it here + // FIXME: unwraps + let document_id = serde_json::from_slice(value) + .map_err(InternalError::SerdeJson) + .unwrap(); + let external_id = + validate_document_id_value(document_id).unwrap().unwrap(); + operations.push(DocumentOperation { + external_id, + internal_id: docid, + kind: DocumentOperationKind::Create, + }); + docids.insert(docid); + } + // anyway, write + writer.insert(field_id, value)?; + } + (Some(value), None) => { + // if primary key, deleted document + if field_id == primary_key { + // FIXME: we already extracted the external docid before. We should retrieve it in the typed chunk + // rather than re-extract it here + // FIXME: unwraps + let document_id = serde_json::from_slice(value) + .map_err(InternalError::SerdeJson) + .unwrap(); + let external_id = + validate_document_id_value(document_id).unwrap().unwrap(); + operations.push(DocumentOperation { + external_id, + internal_id: docid, + kind: DocumentOperationKind::Delete, + }); + docids.remove(docid); + } + } + (Some(_), Some(value)) => { + // updated field, write + writer.insert(field_id, value)?; + } + } } let db = index.documents.remap_data_type::(); - let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap(); - if written { + if !writer.is_empty() { db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?; - docids.insert(docid); } else { db.delete(wtxn, &BEU32::new(docid))?; - // FIXME: unwrap - if !docids.remove(docid) { - panic!("Attempt to remove a document id that doesn't exist") - } } } + let mut external_documents_docids = index.external_documents_ids(wtxn)?.into_static(); + external_documents_docids.apply(operations); + index.put_external_documents_ids(wtxn, &external_documents_docids)?; index.put_documents_ids(wtxn, &docids)?; } From 8370fbc92b488fc026e27907eddc66c9d1edc63f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 11:20:01 +0200 Subject: [PATCH 049/127] Fix snaps 
--- milli/src/snapshot_tests.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 4b21cc175..77d9f41ec 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -340,20 +340,12 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { } pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); - let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap(); + let external_ids = index.external_documents_ids(&rtxn).unwrap().to_hash_map(); let mut snap = String::new(); - writeln!(&mut snap, "soft:").unwrap(); - let stream_soft = soft.stream(); - let soft_external_ids = stream_soft.into_str_vec().unwrap(); - for (key, id) in soft_external_ids { - writeln!(&mut snap, "{key:<24} {id}").unwrap(); - } - writeln!(&mut snap, "hard:").unwrap(); - let stream_hard = hard.stream(); - let hard_external_ids = stream_hard.into_str_vec().unwrap(); - for (key, id) in hard_external_ids { + writeln!(&mut snap, "docids:").unwrap(); + for (key, id) in external_ids { writeln!(&mut snap, "{key:<24} {id}").unwrap(); } From 073f89db790c66483733d7a00695f692c55a457b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 11:22:30 +0200 Subject: [PATCH 050/127] Fix facet tests --- milli/src/update/facet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 3465e5437..2b671e5cb 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -468,7 +468,7 @@ pub(crate) mod test_helpers { min_level_size: self.min_level_size.get(), }; - update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + update.update(wtxn, field_ids).unwrap(); } pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { From 01d5eedf2f86958c4f70c35ed2f4ed8be8fa916b Mon Sep 17 00:00:00 2001 From: Louis 
Dureuil Date: Wed, 25 Oct 2023 13:37:42 +0200 Subject: [PATCH 051/127] Remove some warnings --- milli/src/external_documents_ids.rs | 13 ++-------- milli/src/update/index_documents/mod.rs | 32 +++++-------------------- 2 files changed, 8 insertions(+), 37 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index cd6a7e729..12db4eb1d 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,16 +1,12 @@ use std::borrow::Cow; use std::collections::HashMap; use std::convert::TryInto; -use std::{fmt, str}; +use std::fmt; -use fst::map::IndexedValue; -use fst::{IntoStreamer, Streamer}; -use roaring::RoaringBitmap; +use fst::Streamer; use crate::DocumentId; -const DELETED_ID: u64 = u64::MAX; - pub enum DocumentOperationKind { Create, Delete, @@ -133,8 +129,3 @@ impl Default for ExternalDocumentsIds<'static> { ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap()) } } - -/// Returns the value of the `IndexedValue` with the highest _index_. 
-fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option { - indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) -} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 8d187a89d..7a77f3a96 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -377,11 +377,6 @@ where let index_documents_ids = self.index.documents_ids(self.wtxn)?; let index_is_empty = index_documents_ids.is_empty(); let mut final_documents_ids = RoaringBitmap::new(); - let mut word_pair_proximity_docids = None; - let mut word_position_docids = None; - let mut word_fid_docids = None; - let mut word_docids = None; - let mut exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -399,30 +394,15 @@ where word_docids_reader, exact_word_docids_reader, word_fid_docids_reader, - } => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; - word_docids = Some(cloneable_chunk); - let cloneable_chunk = - unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; - exact_word_docids = Some(cloneable_chunk); - let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } - } + } => TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + }, TypedChunk::WordPairProximityDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_pair_proximity_docids = Some(cloneable_chunk); TypedChunk::WordPairProximityDocids(chunk) } - TypedChunk::WordPositionDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; - word_position_docids = Some(cloneable_chunk); - TypedChunk::WordPositionDocids(chunk) - } + TypedChunk::WordPositionDocids(chunk) => TypedChunk::WordPositionDocids(chunk), otherwise => otherwise, }; From 762b0b47e6275ac50f5903a923afb4c0ee7d63d9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Oct 2023 14:15:06 +0200 Subject: [PATCH 052/127] Use deladd merging function in chunks mergers --- .../src/update/index_documents/extract/mod.rs | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 20ee38c4f..41722a53e 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -28,8 +28,8 @@ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, - MergeableReader, + as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, + MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -108,7 +108,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-exists-docids"); - match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); } @@ -124,7 +124,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-is-null-docids"); - match 
facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); } @@ -140,7 +140,7 @@ pub(crate) fn data_from_obkv_documents( let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { debug!("merge {} database", "facet-id-is-empty-docids"); - match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { Ok(reader) => { let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); } @@ -156,7 +156,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_word_pair_proximity_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::WordPairProximityDocids, "word-pair-proximity-docids", ); @@ -166,7 +166,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_fid_word_count_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdWordCountDocids, "field-id-wordcount-docids", ); @@ -184,7 +184,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { TypedChunk::WordDocids { word_docids_reader, @@ -200,7 +200,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_word_position_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); @@ -210,7 +210,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - 
merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); @@ -220,7 +220,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx, extract_facet_number_docids, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); From d651b3ef01f69c9365ccf87a49de3c5f435c01f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 13:38:56 +0200 Subject: [PATCH 053/127] Remove delete documents files --- milli/src/update/delete_documents.rs | 1249 -------------------------- milli/src/update/facet/delete.rs | 349 ------- milli/src/update/facet/mod.rs | 1 - milli/src/update/mod.rs | 1 - 4 files changed, 1600 deletions(-) delete mode 100644 milli/src/update/delete_documents.rs delete mode 100644 milli/src/update/facet/delete.rs diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs deleted file mode 100644 index 0299e1e4f..000000000 --- a/milli/src/update/delete_documents.rs +++ /dev/null @@ -1,1249 +0,0 @@ -use std::collections::btree_map::Entry; -use std::collections::{BTreeSet, HashMap, HashSet}; - -use fst::IntoStreamer; -use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; -use heed::{BytesDecode, BytesEncode, Database, RwIter}; -use instant_distance::PointId; -use roaring::RoaringBitmap; -use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; - -use super::facet::delete::FacetsDelete; -use super::ClearDocuments; -use crate::error::InternalError; -use crate::facet::FacetType; -use crate::heed_codec::facet::FieldDocIdFacetCodec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::index::Hnsw; -use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32}; - -pub struct DeleteDocuments<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - external_documents_ids: ExternalDocumentsIds<'static>, - 
to_delete_docids: RoaringBitmap, - strategy: DeletionStrategy, -} - -/// Result of a [`DeleteDocuments`] operation. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DocumentDeletionResult { - pub deleted_documents: u64, - pub remaining_documents: u64, -} - -/// Strategy for deleting documents. -/// -/// - Soft-deleted documents are simply marked as deleted without being actually removed from DB. -/// - Hard-deleted documents are definitely suppressed from the DB. -/// -/// Soft-deleted documents trade disk space for runtime performance. -/// -/// Note that any of these variants can be used at any given moment for any indexation in a database. -/// For instance, you can use an [`AlwaysSoft`] followed by an [`AlwaysHard`] option without issue. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] -pub enum DeletionStrategy { - #[default] - /// Definitely suppress documents according to the number or size of soft-deleted documents - Dynamic, - /// Never definitely suppress documents - AlwaysSoft, - /// Always definitely suppress documents - AlwaysHard, -} - -impl std::fmt::Display for DeletionStrategy { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DeletionStrategy::Dynamic => write!(f, "dynamic"), - DeletionStrategy::AlwaysSoft => write!(f, "always_soft"), - DeletionStrategy::AlwaysHard => write!(f, "always_hard"), - } - } -} - -/// Result of a [`DeleteDocuments`] operation, used for internal purposes. -/// -/// It is a superset of the [`DocumentDeletionResult`] structure, giving -/// additional information about the algorithm used to delete the documents. 
-#[derive(Debug)] -pub(crate) struct DetailedDocumentDeletionResult { - pub deleted_documents: u64, - pub remaining_documents: u64, -} - -impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> Result> { - let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); - - Ok(DeleteDocuments { - wtxn, - index, - external_documents_ids, - to_delete_docids: RoaringBitmap::new(), - strategy: Default::default(), - }) - } - - pub fn strategy(&mut self, strategy: DeletionStrategy) { - self.strategy = strategy; - } - - pub fn delete_document(&mut self, docid: u32) { - self.to_delete_docids.insert(docid); - } - - pub fn delete_documents(&mut self, docids: &RoaringBitmap) { - self.to_delete_docids |= docids; - } - - pub fn delete_external_id(&mut self, external_id: &str) -> Option { - let docid = self.external_documents_ids.get(external_id)?; - self.delete_document(docid); - Some(docid) - } - - pub fn execute(self) -> Result { - let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } = - self.execute_inner()?; - - Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) - } - - pub(crate) fn execute_inner(mut self) -> Result { - puffin::profile_function!(); - - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - - // We retrieve the current documents ids that are in the database. - let mut documents_ids = self.index.documents_ids(self.wtxn)?; - let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; - let current_documents_ids_len = documents_ids.len(); - - // We can and must stop removing documents in a database that is empty. 
- if documents_ids.is_empty() { - // but if there was still documents to delete we clear the database entirely - if !soft_deleted_docids.is_empty() { - ClearDocuments::new(self.wtxn, self.index).execute()?; - } - return Ok(DetailedDocumentDeletionResult { - deleted_documents: 0, - remaining_documents: 0, - }); - } - - // We remove the documents ids that we want to delete - // from the documents in the database and write them back. - documents_ids -= &self.to_delete_docids; - self.index.put_documents_ids(self.wtxn, &documents_ids)?; - - // We can execute a ClearDocuments operation when the number of documents - // to delete is exactly the number of documents in the database. - if current_documents_ids_len == self.to_delete_docids.len() { - let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; - return Ok(DetailedDocumentDeletionResult { - deleted_documents: current_documents_ids_len, - remaining_documents, - }); - } - - let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut field_distribution = self.index.field_distribution(self.wtxn)?; - - // we update the field distribution - for docid in self.to_delete_docids.iter() { - let key = BEU32::new(docid); - let document = - self.index.documents.get(self.wtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "documents", key: None }, - )?; - for (fid, _value) in document.iter() { - let field_name = - fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: fid, - process: "delete documents", - })?; - if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) - { - match entry.get().checked_sub(1) { - Some(0) | None => entry.remove(), - Some(count) => entry.insert(count), - }; - } - } - } - - self.index.put_field_distribution(self.wtxn, &field_distribution)?; - - soft_deleted_docids |= &self.to_delete_docids; - - // We always soft-delete the documents, even if they will be permanently - // deleted immediately after. 
- self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; - - // decide for a hard or soft deletion depending on the strategy - let soft_deletion = match self.strategy { - DeletionStrategy::Dynamic => { - // decide to keep the soft deleted in the DB for now if they meet 2 criteria: - // 1. There is less than a fixed rate of 50% of soft-deleted to actual documents, *and* - // 2. Soft-deleted occupy an average of less than a fixed size on disk - - let size_used = self.index.used_size()?; - let nb_documents = self.index.number_of_documents(self.wtxn)?; - let nb_soft_deleted = soft_deleted_docids.len(); - - (nb_soft_deleted < nb_documents) && { - const SOFT_DELETED_SIZE_BYTE_THRESHOLD: u64 = 1_073_741_824; // 1GiB - - // nb_documents + nb_soft_deleted !=0 because if nb_documents is 0 we short-circuit earlier, and then we moved the documents to delete - // from the documents_docids to the soft_deleted_docids. - let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); - let estimated_size_used_by_soft_deleted = - estimated_document_size * nb_soft_deleted; - estimated_size_used_by_soft_deleted < SOFT_DELETED_SIZE_BYTE_THRESHOLD - } - } - DeletionStrategy::AlwaysSoft => true, - DeletionStrategy::AlwaysHard => false, - }; - - if soft_deletion { - // Keep the soft-deleted in the DB - return Ok(DetailedDocumentDeletionResult { - deleted_documents: self.to_delete_docids.len(), - remaining_documents: documents_ids.len(), - }); - } - - self.to_delete_docids = soft_deleted_docids; - - let Index { - env: _env, - main: _main, - word_docids, - exact_word_docids, - word_prefix_docids, - exact_word_prefix_docids, - word_pair_proximity_docids, - field_id_word_count_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, - word_position_docids, - word_prefix_position_docids, - word_fid_docids, - word_prefix_fid_docids, - facet_id_f64_docids: _, - facet_id_string_docids: _, - facet_id_normalized_string_strings: _, - 
facet_id_string_fst: _, - field_id_docid_facet_f64s: _, - field_id_docid_facet_strings: _, - script_language_docids, - facet_id_exists_docids, - facet_id_is_null_docids, - facet_id_is_empty_docids, - vector_id_docid, - documents, - } = self.index; - // Remove from the documents database - for docid in &self.to_delete_docids { - documents.delete(self.wtxn, &BEU32::new(docid))?; - } - // We acquire the current external documents ids map... - // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` - //let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - // We then remove the soft-deleted docids from it - //new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // and write it back to the main database. - //let new_external_documents_ids = new_external_documents_ids.into_static(); - //self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; - - todo!("please autobatch deletions for now"); - - let mut words_to_keep = BTreeSet::default(); - let mut words_to_delete = BTreeSet::default(); - // We iterate over the words and delete the documents ids - // from the word docids database. - remove_from_word_docids( - self.wtxn, - word_docids, - &self.to_delete_docids, - &mut words_to_keep, - &mut words_to_delete, - )?; - remove_from_word_docids( - self.wtxn, - exact_word_docids, - &self.to_delete_docids, - &mut words_to_keep, - &mut words_to_delete, - )?; - - // We construct an FST set that contains the words to delete from the words FST. - let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?; - - let new_words_fst = { - // We retrieve the current words FST from the database. - let words_fst = self.index.words_fst(self.wtxn)?; - let difference = words_fst.op().add(&words_to_delete).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. 
- let mut new_words_fst_builder = fst::SetBuilder::memory(); - new_words_fst_builder.extend_stream(difference.into_stream())?; - - // We create an words FST set from the above builder. - new_words_fst_builder.into_set() - }; - - // We write the new words FST into the main database. - self.index.put_words_fst(self.wtxn, &new_words_fst)?; - - let prefixes_to_delete = - remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?; - - let exact_prefix_to_delete = remove_from_word_prefix_docids( - self.wtxn, - exact_word_prefix_docids, - &self.to_delete_docids, - )?; - - let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); - - // We compute the new prefix FST and write it only if there is a change. - if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { - let new_words_prefixes_fst = { - // We retrieve the current words prefixes FST from the database. - let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = - words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); - - // We stream the new external ids that does no more contains the to-delete external ids. - let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); - new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?; - - // We create an words FST set from the above builder. - new_words_prefixes_fst_builder.into_set() - }; - - // We write the new words prefixes FST into the main database. - self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; - } - - for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { - // We delete the documents ids from the word prefix pair proximity database docids - // and remove the empty pairs too. 
- Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?; - } - Self::delete_from_db( - word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_position_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - Self::delete_from_db( - word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - - // Remove the documents ids from the field id word count database. - Self::delete_from_db( - field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - - if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { - let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; - - let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree - .iter() - .filter(|&point| self.to_delete_docids.contains(point.data.0)) - .cloned() - .map(|point| (point, point.data.0)) - .unzip(); - points_to_remove.iter().for_each(|point| { - rtree.remove(point); - }); - geo_faceted_doc_ids -= docids_to_remove; - - self.index.put_geo_rtree(self.wtxn, &rtree)?; - self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; - } - - for facet_type in [FacetType::Number, FacetType::String] { - let mut affected_facet_values = HashMap::new(); - for field_id in self.index.faceted_fields_ids(self.wtxn)? 
{ - let facet_values = remove_docids_from_field_id_docid_facet_value( - self.index, - self.wtxn, - facet_type, - field_id, - &self.to_delete_docids, - )?; - if !facet_values.is_empty() { - affected_facet_values.insert(field_id, facet_values); - } - } - FacetsDelete::new( - self.index, - facet_type, - affected_facet_values, - &self.to_delete_docids, - ) - .execute(self.wtxn)?; - } - - // Remove the documents ids from the script language database. - Self::delete_from_db( - script_language_docids.iter_mut(self.wtxn)?.remap_key_type(), - &self.to_delete_docids, - )?; - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_exists_docids, - &self.to_delete_docids, - )?; - - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_is_null_docids, - &self.to_delete_docids, - )?; - - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_docids( - self.wtxn, - facet_id_is_empty_docids, - &self.to_delete_docids, - )?; - - // An ugly and slow way to remove the vectors from the HNSW - // It basically reconstructs the HNSW from scratch without editing the current one. - if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? { - let mut points = Vec::new(); - let mut docids = Vec::new(); - for result in vector_id_docid.iter(self.wtxn)? 
{ - let (vector_id, docid) = result?; - if !self.to_delete_docids.contains(docid.get()) { - let pid = PointId::from(vector_id.get()); - let vector = current_hnsw[pid].clone(); - points.push(vector); - docids.push(docid); - } - } - - let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); - - vector_id_docid.clear(self.wtxn)?; - for (pid, docid) in pids.into_iter().zip(docids) { - vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?; - } - self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?; - } - - self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; - - Ok(DetailedDocumentDeletionResult { - deleted_documents: self.to_delete_docids.len(), - remaining_documents: documents_ids.len(), - }) - } - - fn delete_from_db( - mut iter: RwIter, C>, - to_delete_docids: &RoaringBitmap, - ) -> Result<()> - where - C: for<'a> BytesDecode<'a, DItem = RoaringBitmap> - + for<'a> BytesEncode<'a, EItem = RoaringBitmap>, - { - puffin::profile_function!(); - - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - Ok(()) - } -} - -fn remove_from_word_prefix_docids( - txn: &mut heed::RwTxn, - db: &Database, - to_remove: &RoaringBitmap, -) -> Result>> { - puffin::profile_function!(); - - let mut prefixes_to_delete = fst::SetBuilder::memory(); - - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. 
- let mut iter = db.iter_mut(txn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } - - Ok(prefixes_to_delete.into_set()) -} - -fn remove_from_word_docids( - txn: &mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, - words_to_keep: &mut BTreeSet, - words_to_remove: &mut BTreeSet, -) -> Result<()> { - puffin::profile_function!(); - - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = db.iter_mut(txn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - words_to_remove.insert(key.to_owned()); - } else { - words_to_keep.insert(key.to_owned()); - if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? 
}; - } - } - } - - Ok(()) -} - -fn remove_docids_from_field_id_docid_facet_value( - index: &Index, - wtxn: &mut heed::RwTxn, - facet_type: FacetType, - field_id: FieldId, - to_remove: &RoaringBitmap, -) -> heed::Result>> { - puffin::profile_function!(); - - let db = match facet_type { - FacetType::String => { - index.field_id_docid_facet_strings.remap_types::() - } - FacetType::Number => { - index.field_id_docid_facet_f64s.remap_types::() - } - }; - let mut all_affected_facet_values = HashSet::default(); - let mut iter = db - .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::>(); - - while let Some(result) = iter.next() { - let ((_, docid, facet_value), _) = result?; - if to_remove.contains(docid) { - if !all_affected_facet_values.contains(facet_value) { - all_affected_facet_values.insert(facet_value.to_owned()); - } - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } - } - - Ok(all_affected_facet_values) -} - -fn remove_docids_from_facet_id_docids<'a, C>( - wtxn: &'a mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, -{ - puffin::profile_function!(); - - let mut iter = db.remap_key_type::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? 
}; - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use big_s::S; - use heed::RwTxn; - use maplit::hashset; - - use super::*; - use crate::index::tests::TempIndex; - use crate::{db_snap, Filter, Search}; - - fn delete_documents<'t>( - wtxn: &mut RwTxn<'t, '_>, - index: &'t Index, - external_ids: &[&str], - strategy: DeletionStrategy, - ) -> Vec { - let external_document_ids = index.external_documents_ids(wtxn).unwrap(); - let ids_to_delete: Vec = external_ids - .iter() - .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) - .collect(); - - // Delete some documents. - let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); - builder.strategy(strategy); - external_ids.iter().for_each(|id| { - builder.delete_external_id(id); - }); - builder.execute().unwrap(); - - ids_to_delete - } - - fn delete_documents_with_numbers_as_primary_key_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, - { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, - { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]), - ) - .unwrap(); - - // delete those documents, ids are synchronous therefore 0, 1, and 2. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_document(0); - builder.delete_document(1); - builder.delete_document(2); - builder.strategy(deletion_strategy); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - // All these snapshots should be empty since the database was cleared - db_snap!(index, documents_ids, deletion_strategy); - db_snap!(index, word_docids, deletion_strategy); - db_snap!(index, word_pair_proximity_docids, deletion_strategy); - db_snap!(index, facet_id_exists_docids, deletion_strategy); - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - - let rtxn = index.read_txn().unwrap(); - - assert!(index.field_distribution(&rtxn).unwrap().is_empty()); - } - - #[test] - fn delete_documents_with_numbers_as_primary_key() { - delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysHard); - delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysSoft); - } - - fn delete_documents_with_strange_primary_key_(strategy: DeletionStrategy) { - let index = TempIndex::new(); - - index - .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) - .unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "mysuperid": 0, "name": "kevin" }, - { "mysuperid": 1, "name": "kevina" }, - { "mysuperid": 2, "name": "benoit" } - ]), - ) - .unwrap(); - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - - // Delete not all of the documents but some of them. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("0"); - builder.delete_external_id("1"); - builder.strategy(strategy); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, strategy); - db_snap!(index, word_docids, strategy); - db_snap!(index, word_pair_proximity_docids, strategy); - db_snap!(index, soft_deleted_documents_ids, strategy); - } - - #[test] - fn delete_documents_with_strange_primary_key() { - delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysHard); - delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysSoft); - } - - fn filtered_placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label"), S("label2") }); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { 
"docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], deletion_strategy); - - // Placeholder search with filter - let filter = Filter::from_str("label = sign").unwrap().unwrap(); - let results = index.search(&wtxn).filter(filter).execute().unwrap(); - assert!(results.documents_ids.is_empty()); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - db_snap!(index, word_docids, deletion_strategy); - db_snap!(index, facet_id_f64_docids, deletion_strategy); - db_snap!(index, word_pair_proximity_docids, deletion_strategy); - db_snap!(index, facet_id_exists_docids, deletion_strategy); - db_snap!(index, facet_id_string_docids, deletion_strategy); - } - - #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents() { - filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - fn placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { 
"docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], deletion_strategy); - - // Placeholder search - let results = index.search(&wtxn).execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - } - - #[test] - fn placeholder_search_should_not_return_deleted_documents() { - placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn search_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - 
.unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": ["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); - - // search for abstract - let results = index.search(&wtxn).query("abstract").execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn search_should_not_return_deleted_documents() { - 
search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S("_geo"))); - settings.set_sortable_fields(hashset!(S("_geo"))); - }) - .unwrap(); - - index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, - { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, - { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, - { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, - { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, - { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, - { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, - { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, - { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, - { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, - { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, - { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, - { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, - { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, - { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, - { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, - { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 
2.8547 } }, - { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, - { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, - { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } - ])).unwrap(); - - let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &external_ids_to_delete, deletion_strategy); - - // Placeholder search with geo filter - let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); - let results = index.search(&wtxn).filter(filter).execute().unwrap(); - assert!(!results.documents_ids.is_empty()); - for id in results.documents_ids.iter() { - assert!( - !deleted_internal_ids.contains(id), - "The document {} was supposed to be deleted", - id - ); - } - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - db_snap!(index, facet_id_f64_docids, deletion_strategy); - db_snap!(index, facet_id_string_docids, deletion_strategy); - } - - #[test] - fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { - geo_filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - geo_filtered_placeholder_search_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - fn get_documents_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": ["sign"] }, - { "docid": "1_5", "label": ["letter"] }, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, - { "docid": "1_36", "label": ["drawing","painting","pattern"] }, - { "docid": "1_37", "label": 
["art","drawing","outdoor"] }, - { "docid": "1_38", "label": ["aquarium","art","drawing"] }, - { "docid": "1_39", "label": ["abstract"] }, - { "docid": "1_40", "label": ["cartoon"] }, - { "docid": "1_41", "label": ["art","drawing"] }, - { "docid": "1_42", "label": ["art","pattern"] }, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, - { "docid": "1_44", "label": ["drawing"] }, - { "docid": "1_45", "label": ["art"] }, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, - { "docid": "1_47", "label": ["abstract","pattern"] }, - { "docid": "1_52", "label": ["abstract","cartoon"] }, - { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, - { "docid": "1_58", "label": ["abstract","art","cartoon"] }, - { "docid": "1_68", "label": ["design"] }, - { "docid": "1_69", "label": ["geometry"] }, - { "docid": "1_70", "label2": ["geometry", 1.2] }, - { "docid": "1_71", "label2": ["design", 2.2] }, - { "docid": "1_72", "label2": ["geometry", 1.2] } - ]), - ) - .unwrap(); - - let deleted_external_ids = ["1_7", "1_52"]; - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &deleted_external_ids, deletion_strategy); - - // list all documents - let results = index.all_documents(&wtxn).unwrap(); - for result in results { - let (id, _) = result.unwrap(); - assert!( - !deleted_internal_ids.contains(&id), - "The document {} was supposed to be deleted", - id - ); - } - - // list internal document ids - let results = index.documents_ids(&wtxn).unwrap(); - for id in results { - assert!( - !deleted_internal_ids.contains(&id), - "The document {} was supposed to be deleted", - id - ); - } - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // get internal docids from deleted external document ids - let results = index.external_documents_ids(&rtxn).unwrap(); - for id in deleted_external_ids { - assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); - } - drop(rtxn); - - 
db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn get_documents_should_not_return_deleted_documents() { - get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn stats_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "docid": "1_4", "label": ["sign"]}, - { "docid": "1_5", "label": ["letter"]}, - { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, - { "docid": "1_36", "label": ["drawing","painting","pattern"]}, - { "docid": "1_37", "label": ["art","drawing","outdoor"]}, - { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, - { "docid": "1_39", "label": ["abstract"]}, - { "docid": "1_40", "label": ["cartoon"]}, - { "docid": "1_41", "label": ["art","drawing"]}, - { "docid": "1_42", "label": ["art","pattern"]}, - { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, - { "docid": "1_44", "label": ["drawing"], "number": 44i32}, - { "docid": "1_45", "label": ["art"]}, - { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, - { "docid": "1_47", "label": ["abstract","pattern"]}, - { "docid": "1_52", "label": ["abstract","cartoon"]}, - { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, - { "docid": "1_58", "label": ["abstract","art","cartoon"]}, - { "docid": "1_68", "label": ["design"]}, - { "docid": "1_69", "label": ["geometry"]} - ])).unwrap(); - - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); - - // count internal documents - let results = index.number_of_documents(&wtxn).unwrap(); 
- assert_eq!(18, results); - - // count field distribution - let results = index.field_distribution(&wtxn).unwrap(); - assert_eq!(Some(&18), results.get("label")); - assert_eq!(Some(&1), results.get("title")); - assert_eq!(Some(&2), results.get("number")); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, deletion_strategy); - } - - #[test] - fn stats_should_not_return_deleted_documents() { - stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); - stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); - } - - fn stored_detected_script_and_language_should_not_return_deleted_documents_( - deletion_strategy: DeletionStrategy, - ) { - use charabia::{Language, Script}; - let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, - { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, - { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
}, - { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, - { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, - { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, - ])) - .unwrap(); - - let key_cmn = (Script::Cj, Language::Cmn); - let cj_cmn_docs = - index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(1); - expected_cj_cmn_docids.push(5); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - - delete_documents(&mut wtxn, &index, &["1"], deletion_strategy); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let cj_cmn_docs = - index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(5); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - } - - #[test] - fn stored_detected_script_and_language_should_not_return_deleted_documents() { - stored_detected_script_and_language_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysHard, - ); - stored_detected_script_and_language_should_not_return_deleted_documents_( - DeletionStrategy::AlwaysSoft, - ); - } - - #[test] - fn delete_words_exact_attributes() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_searchable_fields(vec![S("text"), S("exact")]); - settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); - }) - .unwrap(); - - index - .add_documents(documents!([ - { "id": 0, "text": "hello" }, - { "id": 1, "exact": "hello"} - ])) - .unwrap(); - db_snap!(index, word_docids, 1, @r###" - hello [0, ] - "###); - db_snap!(index, exact_word_docids, 1, @r###" - hello [1, ] - "###); - db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); - - let mut wtxn = index.write_txn().unwrap(); - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, 
&["1"], DeletionStrategy::AlwaysHard); - wtxn.commit().unwrap(); - - db_snap!(index, word_docids, 2, @r###" - hello [0, ] - "###); - db_snap!(index, exact_word_docids, 2, @""); - db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); - - insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); - let txn = index.read_txn().unwrap(); - let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); - insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); - - let mut s = Search::new(&txn, &index); - s.query("hello"); - let crate::SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - } -} diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs deleted file mode 100644 index 8bd3f196b..000000000 --- a/milli/src/update/facet/delete.rs +++ /dev/null @@ -1,349 +0,0 @@ -use std::collections::{HashMap, HashSet}; - -use heed::RwTxn; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - -use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::ByteSliceRefCodec; -use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; -use crate::{FieldId, Index, Result}; - -/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. -/// -/// Depending on the number of removed elements and the existing size of the database, we use either -/// a bulk delete method or an incremental delete method. 
-pub struct FacetsDelete<'i, 'b> { - index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, - facet_type: FacetType, - affected_facet_values: HashMap>>, - docids_to_delete: &'b RoaringBitmap, - group_size: u8, - max_group_size: u8, - min_level_size: u8, -} -impl<'i, 'b> FacetsDelete<'i, 'b> { - pub fn new( - index: &'i Index, - facet_type: FacetType, - affected_facet_values: HashMap>>, - docids_to_delete: &'b RoaringBitmap, - ) -> Self { - let database = match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }; - Self { - index, - database, - facet_type, - affected_facet_values, - docids_to_delete, - group_size: FACET_GROUP_SIZE, - max_group_size: FACET_MAX_GROUP_SIZE, - min_level_size: FACET_MIN_LEVEL_SIZE, - } - } - - pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - - for (field_id, affected_facet_values) in self.affected_facet_values { - // This is an incorrect condition, since we assume that the length of the database is equal - // to the number of facet values for the given field_id. It means that in some cases, we might - // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could - // really be a performance problem is when we fully delete a large ratio of all facet values for - // each field id. This would almost never happen. Still, to be overly cautious, I have added a - // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance - // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. - if affected_facet_values.len() >= (self.database.len(wtxn)? 
/ 150) { - // Bulk delete - let mut modified = false; - - for facet_value in affected_facet_values { - let key = - FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; - let mut old = self.database.get(wtxn, &key)?.unwrap(); - let previous_len = old.bitmap.len(); - old.bitmap -= self.docids_to_delete; - if old.bitmap.is_empty() { - modified = true; - self.database.delete(wtxn, &key)?; - } else if old.bitmap.len() != previous_len { - modified = true; - self.database.put(wtxn, &key, &old)?; - } - } - if modified { - let builder = FacetsUpdateBulk::new_not_updating_level_0( - self.index, - vec![field_id], - self.facet_type, - ); - builder.execute(wtxn)?; - } - } else { - // Incremental - let inc = FacetsUpdateIncrementalInner { - db: self.database, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, - }; - for facet_value in affected_facet_values { - inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?; - } - } - } - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use std::iter::FromIterator; - - use big_s::S; - use maplit::hashset; - use rand::seq::SliceRandom; - use rand::SeedableRng; - use roaring::RoaringBitmap; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; - use crate::update::facet::test_helpers::ordered_string; - use crate::update::{DeleteDocuments, DeletionStrategy}; - - #[test] - fn delete_mixed_incremental_and_bulk() { - // The point of this test is to create an index populated with documents - // containing different filterable attributes. Then, we delete a bunch of documents - // such that a mix of the incremental and bulk indexer is used (depending on the field id) - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! 
{ S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "label": i / 10, - "colour": i / 100, - "timestamp": i / 2, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); - - let mut wtxn = index.env.write_txn().unwrap(); - - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter(0..100)); - // by deleting the first 100 documents, we expect that: - // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) - // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 - // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 - // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 - // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); - } - - // Same test as above but working with string values for the facets - #[test] - fn delete_mixed_incremental_and_bulk_string() { - // The point of this test is to create an index populated with documents - // containing different filterable attributes. 
Then, we delete a bunch of documents - // such that a mix of the incremental and bulk indexer is used (depending on the field id) - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "label": ordered_string(i / 10), - "colour": ordered_string(i / 100), - "timestamp": ordered_string(i / 2), - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) - db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - - let mut wtxn = index.env.write_txn().unwrap(); - - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter(0..100)); - // by deleting the first 100 documents, we expect that: - // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) - // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 - // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 - // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 - // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); - } - - #[test] - 
fn delete_almost_all_incrementally_string() { - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_filterable_fields( - hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "label": ordered_string(i / 10), - "colour": ordered_string(i / 100), - "timestamp": ordered_string(i / 2), - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) - db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - - let mut docids_to_delete = (0..1000).collect::>(); - docids_to_delete.shuffle(&mut rng); - for docid in docids_to_delete.into_iter().take(990) { - let mut wtxn = index.env.write_txn().unwrap(); - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); - builder.delete_documents(&RoaringBitmap::from_iter([docid])); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - } - - db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); - } -} - -#[allow(unused)] -#[cfg(test)] -mod comparison_bench { - use std::iter::once; - - use rand::Rng; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::test_helpers::FacetIndex; - - // This is a simple test to get an intuition on the relative speed - // of the incremental vs. bulk indexer. 
- // - // The benchmark shows the worst-case scenario for the incremental indexer, since - // each facet value contains only one document ID. - // - // In that scenario, it appears that the incremental indexer is about 70 times slower than the - // bulk indexer. - // #[test] - fn benchmark_facet_indexing_delete() { - let mut r = rand::thread_rng(); - - for i in 1..=20 { - let size = 50_000 * i; - let index = FacetIndex::::new(4, 8, 5); - - let mut txn = index.env.write_txn().unwrap(); - let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); - for i in 0..size { - // field id = 0, left_bound = i, docids = [i] - elements.push(((0, i as f64), once(i).collect())); - } - let timer = std::time::Instant::now(); - index.bulk_insert(&mut txn, &[0], elements.iter()); - let time_spent = timer.elapsed().as_millis(); - println!("bulk {size} : {time_spent}ms"); - - txn.commit().unwrap(); - - for nbr_doc in [1, 100, 1000, 10_000] { - let mut txn = index.env.write_txn().unwrap(); - let timer = std::time::Instant::now(); - // - // delete one document - // - for _ in 0..nbr_doc { - let deleted_u32 = r.gen::() % size; - let deleted_f64 = deleted_u32 as f64; - index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) - } - let time_spent = timer.elapsed().as_millis(); - println!(" delete {nbr_doc} : {time_spent}ms"); - txn.abort().unwrap(); - } - } - } -} diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 2b671e5cb..f932d5aee 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -98,7 +98,6 @@ use crate::update::merge_btreeset_string; use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH}; pub mod bulk; -pub mod delete; pub mod incremental; /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. 
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 6224995a3..97d802d03 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -22,7 +22,6 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; pub(crate) mod del_add; -mod delete_documents; pub(crate) mod facet; mod index_documents; mod indexer_config; From 2263dff02bf7ba62b410b9377bae14e7cd484f79 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 13:40:46 +0200 Subject: [PATCH 054/127] Stop using removed delete pipelines almost everywhere --- benchmarks/benches/indexing.rs | 4 +-- milli/src/index.rs | 32 ++++++++++------------- milli/src/update/facet/mod.rs | 4 --- milli/src/update/index_documents/mod.rs | 7 ++--- milli/src/update/mod.rs | 1 - milli/src/update/prefix_word_pairs/mod.rs | 8 +----- 6 files changed, 18 insertions(+), 38 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 9446c0b0f..cb220a5f0 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -6,9 +6,7 @@ use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; use milli::heed::{EnvOpenOptions, RwTxn}; -use milli::update::{ - DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, -}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::Index; use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; diff --git a/milli/src/index.rs b/milli/src/index.rs index 61ec41788..3e48f5eb1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1469,8 +1469,7 @@ pub(crate) mod tests { use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig, - IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, 
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; @@ -1563,11 +1562,20 @@ pub(crate) mod tests { pub fn delete_document(&self, external_document_id: &str) { let mut wtxn = self.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, self).unwrap(); - delete.strategy(self.index_documents_config.deletion_strategy); + let builder = IndexDocuments::new( + &mut wtxn, + self, + &self.indexer_config, + self.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + let (builder, user_error) = + builder.remove_documents(vec![external_document_id.to_owned()]).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); - delete.delete_external_id(external_document_id); - delete.execute().unwrap(); wtxn.commit().unwrap(); } } @@ -1884,7 +1892,6 @@ pub(crate) mod tests { use maplit::hashset; let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; let index = index; index @@ -2055,8 +2062,6 @@ pub(crate) mod tests { } // Second Batch: replace the documents with soft-deletion { - index.index_documents_config.deletion_strategy = - crate::update::DeletionStrategy::AlwaysSoft; let mut docs1 = vec![]; for i in 0..3 { docs1.push(serde_json::json!( @@ -2125,7 +2130,6 @@ pub(crate) mod tests { drop(rtxn); // Third Batch: replace the documents with soft-deletion again { - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; let mut docs1 = vec![]; for i in 0..3 { docs1.push(serde_json::json!( @@ -2194,7 +2198,6 @@ pub(crate) mod tests { // Fourth Batch: replace the documents without soft-deletion { - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; let mut docs1 = vec![]; for i in 0..3 { docs1.push(serde_json::json!( @@ -2266,7 +2269,6 @@ pub(crate) mod tests { fn bug_3021_first() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut 
index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index @@ -2379,7 +2381,6 @@ pub(crate) mod tests { fn bug_3021_second() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2505,7 +2506,6 @@ pub(crate) mod tests { fn bug_3021_third() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2544,8 +2544,6 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; - index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); db_snap!(index, documents_ids, @"[2, 3, ]"); @@ -2579,7 +2577,6 @@ pub(crate) mod tests { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index .update_settings(|settings| { @@ -2622,7 +2619,6 @@ pub(crate) mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); delete.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index f932d5aee..71e434599 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -563,14 +563,11 @@ mod tests { use 
crate::db_snap; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; - use crate::update::DeletionStrategy; #[test] fn replace_all_identical_soft_deletion_then_hard_deletion() { let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; - index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); @@ -622,7 +619,6 @@ mod tests { db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); // Then replace the last document while disabling soft_deletion - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; let mut documents = vec![]; for i in 999..1000 { documents.push( diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7a77f3a96..0b000da06 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, + IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, + WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{CboRoaringBitmapCodec, Index, Result}; @@ -89,7 +89,6 @@ pub struct IndexDocumentsConfig { pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, - pub deletion_strategy: DeletionStrategy, pub autogenerate_docids: bool, } @@ -2497,7 +2496,6 @@ mod tests { // Delete not all of the documents but some of them. 
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.strategy(DeletionStrategy::AlwaysHard); builder.delete_external_id("0"); builder.delete_external_id("3"); let result = builder.execute().unwrap(); @@ -2559,7 +2557,6 @@ mod tests { ] */ let mut index = TempIndex::new(); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; // START OF BATCH diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 97d802d03..dd8851ccb 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,6 +1,5 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; -pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index e3135d546..7d77490bc 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -149,7 +149,7 @@ mod tests { use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod}; + use crate::update::IndexDocumentsMethod; fn documents_with_enough_different_words_for_prefixes( prefixes: &[&str], @@ -337,7 +337,6 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); delete.delete_documents(&RoaringBitmap::from_iter([50])); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -349,7 +348,6 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysHard); 
delete.delete_documents(&RoaringBitmap::from_iter(0..50)); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -421,7 +419,6 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysSoft); delete.delete_documents(&RoaringBitmap::from_iter([50])); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -433,7 +430,6 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.strategy(DeletionStrategy::AlwaysSoft); delete.delete_documents(&RoaringBitmap::from_iter(0..50)); delete.execute().unwrap(); @@ -460,7 +456,6 @@ mod tests { let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index .update_settings(|settings| { @@ -520,7 +515,6 @@ mod tests { fn replace_hard_deletion() { let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index From c534a1b68764005018fceb767ec737a4dcc21784 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 13:41:11 +0200 Subject: [PATCH 055/127] Stop using delete documents pipeline in batch runner --- index-scheduler/src/batch.rs | 68 ++++++++++++++----------- milli/src/update/index_documents/mod.rs | 2 + 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 3e2cc4281..a4b7e5c45 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -30,8 +30,7 @@ use meilisearch_types::heed::{RoTxn, RwTxn}; use 
meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::{ - DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod, - Settings as MilliSettings, + IndexDocumentsConfig, IndexDocumentsMethod, Settings as MilliSettings, }; use meilisearch_types::milli::{self, Filter, BEU32}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; @@ -1238,7 +1237,8 @@ impl IndexScheduler { let (new_builder, user_result) = builder.remove_documents(document_ids)?; builder = new_builder; - + // Uses Invariant: remove documents actually always returns Ok for the inner result + let count = user_result.unwrap(); let provided_ids = if let Some(Details::DocumentDeletion { provided_ids, .. }) = task.details @@ -1249,23 +1249,11 @@ impl IndexScheduler { unreachable!(); }; - match user_result { - Ok(count) => { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(count), - }); - } - Err(e) => { - task.status = Status::Failed; - task.details = Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(0), - }); - task.error = Some(milli::Error::from(e).into()); - } - } + task.status = Status::Succeeded; + task.details = Some(Details::DocumentDeletion { + provided_ids, + deleted_documents: Some(count), + }); } } } @@ -1288,21 +1276,42 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => { - let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?; - documents.iter().flatten().for_each(|id| { - builder.delete_external_id(id); - }); + let indexer_config = self.index_mapper.indexer_config(); + let config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let must_stop_processing = 
self.must_stop_processing.clone(); - let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?; + let mut builder = milli::update::IndexDocuments::new( + index_wtxn, + index, + indexer_config, + config, + |indexing_step| debug!("update: {:?}", indexing_step), + || must_stop_processing.get(), + )?; + + let document_ids = documents.iter().cloned().flatten().collect(); + + let (new_builder, user_result) = builder.remove_documents(document_ids)?; + builder = new_builder; + // Uses Invariant: remove documents actually always returns Ok for the inner result + let count = user_result.unwrap(); for (task, documents) in tasks.iter_mut().zip(documents) { task.status = Status::Succeeded; task.details = Some(Details::DocumentDeletion { provided_ids: documents.len(), - deleted_documents: Some(deleted_documents.min(documents.len() as u64)), + deleted_documents: Some(count.min(documents.len() as u64)), }); } + if !tasks.iter().all(|res| res.error.is_some()) { + let addition = builder.execute()?; + info!("document deletion done: {:?}", addition); + } + Ok(tasks) } IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => { @@ -1558,9 +1567,10 @@ fn delete_document_by_filter<'a>( } e => e.into(), })?; - let mut delete_operation = DeleteDocuments::new(wtxn, index)?; - delete_operation.delete_documents(&candidates); - delete_operation.execute().map(|result| result.deleted_documents)? + todo!("need a way to get back the external ids from the internal ids"); + // let mut delete_operation = DeleteDocuments::new(wtxn, index)?; + // delete_operation.delete_documents(&candidates); + // delete_operation.execute().map(|result| result.deleted_documents)? 
} else { 0 }) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 0b000da06..c8481bd48 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -180,6 +180,7 @@ where // Early return when there is no document to add if to_delete.is_empty() { + // Maintains Invariant: remove documents actually always returns Ok for the inner result return Ok((self, Ok(0))); } @@ -192,6 +193,7 @@ where self.deleted_documents += deleted_documents; + // Maintains Invariant: remove documents actually always returns Ok for the inner result Ok((self, Ok(deleted_documents))) } From 113527f4660b8c062beae43eace63cb16a9d2bd5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 14:14:15 +0200 Subject: [PATCH 056/127] Remove soft-deleted related methods from Index --- meilisearch-types/src/error.rs | 1 - milli/src/error.rs | 2 - milli/src/index.rs | 67 +------------------ milli/src/search/facet/filter.rs | 3 - milli/src/snapshot_tests.rs | 10 --- milli/src/update/available_documents_ids.rs | 35 ++-------- milli/src/update/clear_documents.rs | 1 - milli/src/update/facet/mod.rs | 3 - milli/src/update/index_documents/mod.rs | 3 - milli/src/update/index_documents/transform.rs | 6 +- milli/src/update/prefix_word_pairs/mod.rs | 2 - 11 files changed, 9 insertions(+), 124 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 4b6711601..afe9c5189 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -324,7 +324,6 @@ impl ErrorCode for milli::Error { UserError::SerdeJson(_) | UserError::InvalidLmdbOpenOptions | UserError::DocumentLimitReached - | UserError::AccessingSoftDeletedDocument { .. } | UserError::UnknownInternalDocumentId { .. 
} => Code::Internal, UserError::InvalidStoreFile => Code::InvalidStoreFile, UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice, diff --git a/milli/src/error.rs b/milli/src/error.rs index e9e1fddd3..b249f2977 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry { #[derive(Error, Debug)] pub enum UserError { - #[error("A soft deleted internal document id have been used: `{document_id}`.")] - AccessingSoftDeletedDocument { document_id: DocumentId }, #[error("A document cannot contain more than 65,535 fields.")] AttributeLimitReached, #[error(transparent)] diff --git a/milli/src/index.rs b/milli/src/index.rs index 3e48f5eb1..b20674d4c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -40,7 +40,6 @@ pub mod main_key { pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; - pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids"; pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; @@ -367,29 +366,6 @@ impl Index { Ok(count.unwrap_or_default()) } - /* deleted documents ids */ - - /// Writes the soft deleted documents ids. - pub(crate) fn put_soft_deleted_documents_ids( - &self, - wtxn: &mut RwTxn, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>( - wtxn, - main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY, - docids, - ) - } - - /// Returns the soft deleted documents ids. - pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self - .main - .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)? - .unwrap_or_default()) - } - /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. 
@@ -1187,12 +1163,7 @@ impl Index { rtxn: &'t RoTxn, ids: impl IntoIterator + 'a, ) -> Result)>> + 'a> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - Ok(ids.into_iter().map(move |id| { - if soft_deleted_documents.contains(id) { - return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?; - } let kv = self .documents .get(rtxn, &BEU32::new(id))? @@ -1418,14 +1389,10 @@ impl Index { rtxn: &RoTxn, key: &(Script, Language), ) -> heed::Result> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - let doc_ids = self.script_language_docids.get(rtxn, key)?; - Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) + Ok(self.script_language_docids.get(rtxn, key)?) } pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result>> { - let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; - let mut script_language: HashMap> = HashMap::new(); let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new(); let mut total = 0; @@ -1433,7 +1400,7 @@ impl Index { let ((script, language), docids) = sl?; // keep only Languages that contains at least 1 document. 
- let remaining_documents_count = (docids - &soft_deleted_documents).len(); + let remaining_documents_count = docids.len(); total += remaining_documents_count; if remaining_documents_count > 0 { script_language_doc_count.push((script, language, remaining_documents_count)); @@ -1918,7 +1885,6 @@ pub(crate) mod tests { 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [0, ] 1 0 1 1 [1, ] @@ -1943,7 +1909,6 @@ pub(crate) mod tests { 2 6 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]"); db_snap!(index, facet_id_f64_docids, 2, @r###" 1 0 0 1 [0, ] 1 0 1 1 [1, 4, ] @@ -1965,7 +1930,6 @@ pub(crate) mod tests { 2 6 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]"); db_snap!(index, facet_id_f64_docids, 3, @r###" 1 0 0 1 [0, ] 1 0 1 1 [1, 4, ] @@ -1989,7 +1953,6 @@ pub(crate) mod tests { 2 6 3 7 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); db_snap!(index, facet_id_f64_docids, 3, @r###" 0 0 0 1 [4, ] 0 0 1 1 [5, ] @@ -2052,7 +2015,6 @@ pub(crate) mod tests { 2 2 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [0, ] 1 0 1 1 [1, ] @@ -2085,7 +2047,6 @@ pub(crate) mod tests { 2 6 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]"); db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [0, 4, ] 1 0 1 1 [1, 5, ] @@ -2153,7 +2114,6 @@ pub(crate) mod tests { 2 9 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]"); db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [0, 4, 7, ] 1 0 1 1 [1, 5, 8, ] @@ -2221,7 +2181,7 @@ pub(crate) mod tests { 2 12 3 3 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" 1 0 0 1 [10, ] 1 0 3 1 [3, 11, ] @@ -2291,7 +2251,6 @@ pub(crate) mod tests { 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 
1, @"[]"); index.delete_document("34"); @@ -2302,7 +2261,6 @@ pub(crate) mod tests { 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); index .update_settings(|s| { @@ -2318,7 +2276,6 @@ pub(crate) mod tests { hard: 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); // So that this document addition works correctly now. // It would be wrongly interpreted as a replacement before @@ -2331,7 +2288,6 @@ pub(crate) mod tests { 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // We do the test again, but deleting the document with id 0 instead of id 1 now index.delete_document("38"); @@ -2343,7 +2299,6 @@ pub(crate) mod tests { 34 1 38 0 "###); - db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); index .update_settings(|s| { @@ -2357,7 +2312,6 @@ pub(crate) mod tests { hard: 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); // And adding lots of documents afterwards instead of just one. // These extra subtests don't add much, but it's better than nothing. 
@@ -2374,7 +2328,6 @@ pub(crate) mod tests { 41 3 42 5 "###); - db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); } #[test] @@ -2403,7 +2356,6 @@ pub(crate) mod tests { 30 0 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("34"); @@ -2414,7 +2366,6 @@ pub(crate) mod tests { 30 0 34 1 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); index .update_settings(|s| { @@ -2430,7 +2381,6 @@ pub(crate) mod tests { hard: 30 0 "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); // So that when we add a new document index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); @@ -2444,7 +2394,6 @@ pub(crate) mod tests { 30 0 35 1 "###); - db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // And when we add 34 again, we don't replace document 35 index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); @@ -2458,7 +2407,6 @@ pub(crate) mod tests { 34 2 35 1 "###); - db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); let rtxn = index.read_txn().unwrap(); let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; @@ -2499,7 +2447,6 @@ pub(crate) mod tests { 38 4 39 5 "###); - db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); } #[test] @@ -2530,7 +2477,6 @@ pub(crate) mod tests { 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); index.delete_document("3"); @@ -2542,7 +2488,6 @@ pub(crate) mod tests { 4 1 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); @@ -2553,7 +2498,6 @@ pub(crate) mod tests { 4 3 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); index .add_documents(documents!([ @@ -2569,7 +2513,6 @@ pub(crate) mod tests { 4 3 5 2 "###); - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); } #[test] @@ -2598,7 +2541,6 @@ pub(crate) mod tests { 11 0 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); index 
.add_documents(documents!([ @@ -2615,7 +2557,6 @@ pub(crate) mod tests { 11 0 4 2 "###); - db_snap!(index, soft_deleted_documents_ids, @"[1, ]"); let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); @@ -2630,7 +2571,6 @@ pub(crate) mod tests { 11 0 4 2 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); index .add_documents(documents!([ @@ -2647,7 +2587,6 @@ pub(crate) mod tests { 11 0 4 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]"); let rtxn = index.read_txn().unwrap(); let search = Search::new(&rtxn, &index); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index fac7b68ea..4d9bbc183 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -223,12 +223,9 @@ impl<'a> Filter<'a> { impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - // and finally we delete all the soft_deleted_documents, again, only once at the very end self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) } fn evaluate_operator( diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 77d9f41ec..c22038f81 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -98,7 +98,6 @@ Create a snapshot test of the given database. 
- `facet_id_string_docids` - `documents_ids` - `stop_words` - - `soft_deleted_documents_ids` - `field_distribution` - `fields_ids_map` - `geo_faceted_documents_ids` @@ -308,12 +307,6 @@ pub fn snap_stop_words(index: &Index) -> String { let snap = format!("{stop_words:?}"); snap } -pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); - - display_bitmap(&soft_deleted_documents_ids) -} pub fn snap_field_distributions(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let mut snap = String::new(); @@ -484,9 +477,6 @@ macro_rules! full_snap_of_db { ($index:ident, stop_words) => {{ $crate::snapshot_tests::snap_stop_words(&$index) }}; - ($index:ident, soft_deleted_documents_ids) => {{ - $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) - }}; ($index:ident, field_distribution) => {{ $crate::snapshot_tests::snap_field_distributions(&$index) }}; diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 784bee5a7..f460693ba 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds { } impl AvailableDocumentsIds { - pub fn from_documents_ids( - docids: &RoaringBitmap, - soft_deleted_docids: &RoaringBitmap, - ) -> AvailableDocumentsIds { - let used_docids = docids | soft_deleted_docids; - - match used_docids.max() { + pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { + match docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); - available -= used_docids; + available -= docids; let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), @@ -50,7 +45,7 @@ mod tests { #[test] fn empty() { let base = RoaringBitmap::new(); - let left = AvailableDocumentsIds::from_documents_ids(&base, 
&RoaringBitmap::new()); + let left = AvailableDocumentsIds::from_documents_ids(&base); let right = 0..=u32::max_value(); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } @@ -63,28 +58,8 @@ mod tests { base.insert(100); base.insert(405); - let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let left = AvailableDocumentsIds::from_documents_ids(&base); let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } - - #[test] - fn soft_deleted() { - let mut base = RoaringBitmap::new(); - base.insert(0); - base.insert(10); - base.insert(100); - base.insert(405); - - let mut soft_deleted = RoaringBitmap::new(); - soft_deleted.insert(1); - soft_deleted.insert(11); - soft_deleted.insert(101); - soft_deleted.insert(406); - - let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); - let right = - (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); - left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); - } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 3eb7e0910..ca5f69808 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -56,7 +56,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &empty_roaring)?; - self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 71e434599..70a5e24c8 100644 --- a/milli/src/update/facet/mod.rs +++ 
b/milli/src/update/facet/mod.rs @@ -594,7 +594,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); - db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); let mut documents = vec![]; for i in 0..999 { @@ -616,7 +615,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); - db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); // Then replace the last document while disabling soft_deletion let mut documents = vec![]; @@ -639,7 +637,6 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); - db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c8481bd48..864e13d04 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -999,7 +999,6 @@ mod tests { assert_eq!(count, 6); db_snap!(index, word_docids, "updated"); - db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); drop(rtxn); } @@ -2649,8 +2648,6 @@ mod tests { 0 1 "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); - // BATCH 3 println!("--- ENTERING BATCH 3"); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e02da8cb5..872230d99 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -132,17 +132,13 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.max_memory.map(|mem| mem / 2), ); let documents_ids = index.documents_ids(wtxn)?; - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; Ok(Transform { index, fields_ids_map: index.fields_ids_map(wtxn)?, 
indexer_settings, autogenerate_docids, - available_documents_ids: AvailableDocumentsIds::from_documents_ids( - &documents_ids, - &soft_deleted_documents_ids, - ), + available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, index_documents_method, diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 7d77490bc..d6aa8e5a3 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -508,7 +508,6 @@ mod tests { db_snap!(index, word_docids, "replaced"); db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]"); } #[test] @@ -568,6 +567,5 @@ mod tests { db_snap!(index, word_docids, "replaced"); db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]"); } } From fa6c7f65cae9fcb232bcc671c82421304da47d25 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 14:42:09 +0200 Subject: [PATCH 057/127] Add TmpIndex::delete_documents --- milli/src/index.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index b20674d4c..64aff636b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1526,7 +1526,7 @@ pub(crate) mod tests { Ok(()) } - pub fn delete_document(&self, external_document_id: &str) { + pub fn delete_documents(&self, external_document_ids: Vec) { let mut wtxn = self.write_txn().unwrap(); let builder = IndexDocuments::new( @@ -1538,13 +1538,16 @@ pub(crate) mod tests { || false, ) 
.unwrap(); - let (builder, user_error) = - builder.remove_documents(vec![external_document_id.to_owned()]).unwrap(); + let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap(); user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } + + pub fn delete_document(&self, external_document_id: &str) { + self.delete_documents(vec![external_document_id.to_string()]) + } } #[test] From 290e773d23a4c0108b9c9330ed2a5ca76028e973 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 14:49:25 +0200 Subject: [PATCH 058/127] remove more warnings and fix some tests --- milli/src/index.rs | 10 +++----- milli/src/snapshot_tests.rs | 3 +-- milli/src/update/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 25 ++++++-------------- milli/src/update/prefix_word_pairs/mod.rs | 28 ++++------------------- milli/src/update/settings.rs | 10 +++----- 6 files changed, 19 insertions(+), 59 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 64aff636b..a280a1a48 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1861,8 +1861,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); - let index = index; + let index = TempIndex::new(); index .update_settings(|settings| { @@ -1973,7 +1972,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); + let index = TempIndex::new(); index .update_settings(|settings| { @@ -2561,10 +2560,7 @@ pub(crate) mod tests { 4 2 "###); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_documents(Default::default()); db_snap!(index, documents_ids, @"[0, 2, 3, ]"); db_snap!(index, external_documents_ids, @r###" diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index c22038f81..1d8d63277 100644 --- a/milli/src/snapshot_tests.rs +++ 
b/milli/src/snapshot_tests.rs @@ -4,9 +4,8 @@ use std::path::Path; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; -use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index}; +use crate::{make_db_snap_from_iter, obkv_to_json, Index}; #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 70a5e24c8..05e6a93d8 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -566,7 +566,7 @@ mod tests { #[test] fn replace_all_identical_soft_deletion_then_hard_deletion() { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); index .update_settings(|settings| { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 864e13d04..c1e40373f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -696,7 +696,6 @@ mod tests { use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; - use crate::update::DeleteDocuments; use crate::{db_snap, BEU16}; #[test] @@ -1101,17 +1100,15 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ])) .unwrap(); - let mut wtxn = index.write_txn().unwrap(); - assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); // Delete not all of the documents but some of them. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("30"); - builder.execute().unwrap(); + index.delete_document("30"); - let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); + let txn = index.read_txn().unwrap(); + assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId")); + + let external_documents_ids = index.external_documents_ids(&txn).unwrap(); assert!(external_documents_ids.get("30").is_none()); - wtxn.commit().unwrap(); index .add_documents(documents!([ @@ -2493,16 +2490,8 @@ mod tests { db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); - let mut wtxn = index.write_txn().unwrap(); - // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("0"); - builder.delete_external_id("3"); - let result = builder.execute().unwrap(); - println!("{result:?}"); - - wtxn.commit().unwrap(); + index.delete_documents(vec!["0".into(), "3".into()]); db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); @@ -2557,7 +2546,7 @@ mod tests { ), ] */ - let mut index = TempIndex::new(); + let index = TempIndex::new(); // START OF BATCH diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index d6aa8e5a3..1ec57e080 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -142,9 +142,6 @@ pub fn write_into_lmdb_database_without_merging( #[cfg(test)] mod tests { use std::io::Cursor; - use std::iter::FromIterator; - - use roaring::RoaringBitmap; use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; @@ -335,22 +332,14 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "initial"); 
db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_documents(&RoaringBitmap::from_iter([50])); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_document("9000"); db_snap!(index, documents_ids, "first_delete"); db_snap!(index, word_docids, "first_delete"); db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_documents(&RoaringBitmap::from_iter(0..50)); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_documents((0..50).map(|id| id.to_string()).collect()); db_snap!(index, documents_ids, "second_delete"); db_snap!(index, word_docids, "second_delete"); @@ -417,23 +406,14 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "initial"); db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_documents(&RoaringBitmap::from_iter([50])); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_document("9000"); db_snap!(index, documents_ids, "first_delete"); db_snap!(index, word_docids, "first_delete"); db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - - delete.delete_documents(&RoaringBitmap::from_iter(0..50)); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_documents((0..50).map(|id| id.to_string()).collect()); db_snap!(index, documents_ids, "second_delete"); db_snap!(index, word_docids, 
"second_delete"); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c2c0e9084..fd7ffa760 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -923,7 +923,7 @@ mod tests { use super::*; use crate::error::Error; use crate::index::tests::TempIndex; - use crate::update::{ClearDocuments, DeleteDocuments}; + use crate::update::ClearDocuments; use crate::{Criterion, Filter, SearchResult}; #[test] @@ -1768,13 +1768,9 @@ mod tests { } index.add_documents(documents! { docs }).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - (0..5).for_each(|id| { - builder.delete_external_id(&id.to_string()); - }); - builder.execute().unwrap(); + index.delete_documents((0..5).map(|id| id.to_string()).collect()); + let mut wtxn = index.write_txn().unwrap(); index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_searchable_fields(vec!["id".to_string()]); From 73c06d31d973771d667ba40939db8f399061a5cb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 16:50:49 +0200 Subject: [PATCH 059/127] snapshot always display stuff in consistent order --- milli/src/snapshot_tests.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 1d8d63277..730d0a5c8 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -333,6 +333,9 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let external_ids = index.external_documents_ids(&rtxn).unwrap().to_hash_map(); + // ensure fixed order (not guaranteed by hashmap) + let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect(); + external_ids.sort_by(|(l, _), (r, _)| l.cmp(r)); let mut snap = String::new(); From 3c158818187313eec79684ac498e43fd71e47409 Mon Sep 17 00:00:00 2001 From: Louis 
Dureuil Date: Wed, 25 Oct 2023 17:32:36 +0200 Subject: [PATCH 060/127] Add simple delete test --- milli/src/index.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index a280a1a48..481f698fc 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2332,6 +2332,32 @@ pub(crate) mod tests { "###); } + #[test] + fn simple_delete() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + index + .add_documents(documents!([ + { "id": 30 }, + { "id": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + docids: + 30 0 + 34 1"###); + + index.delete_document("34"); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + docids: + 30 0 + "###); + } + #[test] fn bug_3021_second() { // https://github.com/meilisearch/meilisearch/issues/3021 From e78281785ca8568e4a55833ec1fa4139dc097611 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 2023 17:32:45 +0200 Subject: [PATCH 061/127] Actually execute the transform even if there are only documents to delete --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c1e40373f..ee1dea7d5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -201,7 +201,7 @@ where pub fn execute(mut self) -> Result { puffin::profile_function!(); - if self.added_documents == 0 { + if self.added_documents == 0 && self.deleted_documents == 0 { let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } From a35988550cc785b66368f0e7cc4904930486f793 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 25 Oct 
2023 18:02:43 +0200 Subject: [PATCH 062/127] Fix some snapshots --- milli/src/index.rs | 144 +++++++++--------------- milli/src/update/index_documents/mod.rs | 6 +- 2 files changed, 58 insertions(+), 92 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 481f698fc..ba00111b3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1880,8 +1880,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 0 0 1 1 2 2 @@ -1902,13 +1901,12 @@ pub(crate) mod tests { } index.add_documents(documents!(docs)).unwrap(); - db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); db_snap!(index, facet_id_f64_docids, 2, @r###" @@ -1922,14 +1920,12 @@ pub(crate) mod tests { .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) .unwrap(); - db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); db_snap!(index, facet_id_f64_docids, 3, @r###" @@ -1946,14 +1942,13 @@ pub(crate) mod tests { }) .unwrap(); - db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 - 3 7 + docids: + 0 0 + 1 1 + 2 2 + 3 3 "###); db_snap!(index, facet_id_f64_docids, 3, @r###" 0 0 0 1 [4, ] @@ -2010,8 +2005,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 0 0 1 1 2 2 @@ -2040,13 +2034,12 @@ pub(crate) mod tests { } add_documents(&index, vec![docs1, docs2]); - 
db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 4 - 1 5 - 2 6 + docids: + 0 0 + 1 1 + 2 2 3 3 "###); db_snap!(index, facet_id_f64_docids, 1, @r###" @@ -2248,8 +2241,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 34 1 38 0 "###); @@ -2258,9 +2250,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 34 1 + docids: 38 0 "###); @@ -2274,8 +2264,7 @@ pub(crate) mod tests { // do not contain any entry for previously soft-deleted document ids db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: + docids: 38 0 "###); @@ -2285,8 +2274,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: + docids: 34 1 38 0 "###); @@ -2296,10 +2284,8 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 5, @r###" - soft: - hard: + docids: 34 1 - 38 0 "###); index @@ -2310,8 +2296,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 6, @r###" - soft: - hard: + docids: 34 1 "###); @@ -2321,8 +2306,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); db_snap!(index, external_documents_ids, 7, @r###" - soft: - hard: + docids: 34 1 38 0 39 2 @@ -2379,8 +2363,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 30 0 34 1 "###); @@ -2389,10 +2372,8 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: + docids: 30 0 - 34 1 "###); index @@ 
-2405,8 +2386,7 @@ pub(crate) mod tests { // do not contain any entry for previously soft-deleted document ids db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 3, @r###" - soft: - hard: + docids: 30 0 "###); @@ -2417,8 +2397,7 @@ pub(crate) mod tests { // The external documents ids don't have several external ids pointing to the same // internal document id db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: + docids: 30 0 35 1 "###); @@ -2429,8 +2408,7 @@ pub(crate) mod tests { // And document 35 still exists, is not deleted db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 5, @r###" - soft: - hard: + docids: 30 0 34 2 35 1 @@ -2466,8 +2444,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); db_snap!(index, external_documents_ids, 6, @r###" - soft: - hard: + docids: 30 0 34 2 35 1 @@ -2499,8 +2476,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: + docids: 3 0 4 1 5 2 @@ -2510,20 +2486,17 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 3 0 + docids: 4 1 5 2 "###); index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); - db_snap!(index, documents_ids, @"[2, 3, ]"); + db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: - 4 3 + docids: + 4 1 5 2 "###); @@ -2533,12 +2506,11 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" - soft: - hard: + docids: 3 0 - 4 3 + 4 1 5 2 "###); } @@ -2564,8 +2536,7 @@ pub(crate) mod tests { db_snap!(index, documents_ids, @"[0, 1, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 11 0 4 1 "###); @@ 
-2577,24 +2548,22 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 3 + docids: + 1 2 11 0 - 4 2 + 4 1 "###); index.delete_documents(Default::default()); - db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 3 + docids: + 1 2 11 0 - 4 2 + 4 1 "###); index @@ -2604,11 +2573,10 @@ pub(crate) mod tests { ])) .unwrap(); - db_snap!(index, documents_ids, @"[0, 1, 4, ]"); + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); db_snap!(index, external_documents_ids, @r###" - soft: - hard: - 1 4 + docids: + 1 2 11 0 4 1 "###); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ee1dea7d5..5f5c418d9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2586,8 +2586,7 @@ mod tests { {"id":1,"doggo":"bernese"} "###); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 1 0 "###); @@ -2632,8 +2631,7 @@ mod tests { "###); db_snap!(index, external_documents_ids, @r###" - soft: - hard: + docids: 0 1 "###); From 9a2dccc3bcda089cdc596481d585b486f9b38729 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Oct 2023 10:36:34 +0200 Subject: [PATCH 063/127] Add iterator to find external ids of a bitmap of internal ids --- milli/src/external_documents_ids.rs | 68 ++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 12db4eb1d..02794609f 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -4,6 +4,7 @@ use std::convert::TryInto; use std::fmt; use fst::Streamer; +use roaring::RoaringBitmap; use crate::DocumentId; @@ -55,7 +56,24 @@ impl<'a> 
ExternalDocumentsIds<'a> { self.0.as_fst().as_bytes() } - /// Apply the list of operations passed as argument, modifying the current external to internal id mapping. + /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between + /// these internal ids and their external id. + /// + /// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`, + /// where the returned values can be: + /// - `Ok((external_id, internal_id))`: if a mapping was found + /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found. + /// In that case the returned bitmap contains the internal ids whose external ids were not found after traversing + /// the entire fst. + pub fn find_external_id_of( + &self, + internal_ids: RoaringBitmap, + ) -> ExternalToInternalOwnedIterator<'_> { + let it = ExternalToInternalOwnedIterator { stream: self.0.stream(), internal_ids }; + it + } + + /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. /// /// If the list contains multiple operations on the same external id, then the result is unspecified. /// @@ -129,3 +147,51 @@ impl Default for ExternalDocumentsIds<'static> { ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap()) } } + +/// An iterator over mappings between requested internal ids and external ids. +/// +/// See [`ExternalDocumentsIds::find_external_id_of`] for details. +pub struct ExternalToInternalOwnedIterator<'it> { + stream: fst::map::Stream<'it>, + internal_ids: RoaringBitmap, +} + +impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> { + /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids. 
+ type Item = Result<(String, DocumentId), RoaringBitmap>; + + fn next(&mut self) -> Option { + // if all requested ids were found, we won't find any other, so short-circuit + if self.internal_ids.is_empty() { + return None; + } + loop { + let Some((external, internal)) = self.stream.next() else { + // we exhausted the stream but we still have some internal ids to find + let remaining_ids = std::mem::take(&mut self.internal_ids); + return Some(Err(remaining_ids)); + // note: next calls to `next` will return `None` since we replaced the internal_ids + // with the default empty bitmap + }; + let internal = internal.try_into().unwrap(); + let was_contained = self.internal_ids.remove(internal); + if was_contained { + return Some(Ok((std::str::from_utf8(external).unwrap().to_owned(), internal))); + } + } + } +} + +impl<'it> ExternalToInternalOwnedIterator<'it> { + /// Returns the bitmap of internal ids whose external id are yet to be found + pub fn remaining_internal_ids(&self) -> &RoaringBitmap { + &self.internal_ids + } + + /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids. + /// + /// Use this when you don't need the mapping between the external and the internal ids. 
+ pub fn only_external_ids(self) -> impl Iterator> + 'it { + self.map(|res| res.map(|(external, _internal)| external)) + } +} From 652ac3052d518433c69fab9b3cdfaccc0f6bed68 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Oct 2023 10:54:20 +0200 Subject: [PATCH 064/127] use new iterator in batch --- index-scheduler/src/batch.rs | 50 +++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index a4b7e5c45..c4f9c12be 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -30,7 +30,7 @@ use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; use meilisearch_types::milli::heed::CompactionOption; use meilisearch_types::milli::update::{ - IndexDocumentsConfig, IndexDocumentsMethod, Settings as MilliSettings, + IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, }; use meilisearch_types::milli::{self, Filter, BEU32}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; @@ -43,7 +43,7 @@ use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; use crate::utils::{self, swap_index_uid_in_task}; -use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId}; +use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId}; /// Represents a combination of tasks that can all be processed at the same time. 
/// @@ -1323,7 +1323,13 @@ impl IndexScheduler { } else { unreachable!() }; - let deleted_documents = delete_document_by_filter(index_wtxn, filter, index); + let deleted_documents = delete_document_by_filter( + index_wtxn, + filter, + self.index_mapper.indexer_config(), + self.must_stop_processing.clone(), + index, + ); let original_filter = if let Some(Details::DocumentDeletionByFilter { original_filter, deleted_documents: _, @@ -1557,6 +1563,8 @@ impl IndexScheduler { fn delete_document_by_filter<'a>( wtxn: &mut RwTxn<'a, '_>, filter: &serde_json::Value, + indexer_config: &IndexerConfig, + must_stop_processing: MustStopProcessing, index: &'a Index, ) -> Result { let filter = Filter::from_json(filter)?; @@ -1567,10 +1575,38 @@ fn delete_document_by_filter<'a>( } e => e.into(), })?; - todo!("need a way to get back the external ids from the internal ids"); - // let mut delete_operation = DeleteDocuments::new(wtxn, index)?; - // delete_operation.delete_documents(&candidates); - // delete_operation.execute().map(|result| result.deleted_documents)? + let external_documents_ids = index.external_documents_ids(wtxn)?; + // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings). 
+ // Since what we have is an iterator, it would be better to delete in chunks + let external_to_internal: std::result::Result, RoaringBitmap> = + external_documents_ids.find_external_id_of(candidates).only_external_ids().collect(); + let document_ids = match external_to_internal { + Ok(external_ids) => external_ids, + Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids), + }; + + let config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + + let mut builder = milli::update::IndexDocuments::new( + wtxn, + index, + indexer_config, + config, + |indexing_step| debug!("update: {:?}", indexing_step), + || must_stop_processing.get(), + )?; + + let (new_builder, user_result) = builder.remove_documents(document_ids)?; + builder = new_builder; + // Uses Invariant: remove documents actually always returns Ok for the inner result + let count = user_result.unwrap(); + + let _ = builder.execute()?; + + count } else { 0 }) From ae4ec8ea55bc976cd3aacab90cb6d845642e40d3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Oct 2023 12:15:55 +0200 Subject: [PATCH 065/127] Add delete_document_using_wtxn to TempIndex --- milli/src/index.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index ba00111b3..d99c36b65 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1526,11 +1526,13 @@ pub(crate) mod tests { Ok(()) } - pub fn delete_documents(&self, external_document_ids: Vec) { - let mut wtxn = self.write_txn().unwrap(); - + pub fn delete_documents_using_wtxn<'t>( + &'t self, + wtxn: &mut RwTxn<'t, '_>, + external_document_ids: Vec, + ) { let builder = IndexDocuments::new( - &mut wtxn, + wtxn, self, &self.indexer_config, self.index_documents_config.clone(), @@ -1541,6 +1543,12 @@ pub(crate) mod tests { let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap(); 
user_error.unwrap(); builder.execute().unwrap(); + } + + pub fn delete_documents(&self, external_document_ids: Vec) { + let mut wtxn = self.write_txn().unwrap(); + + self.delete_documents_using_wtxn(&mut wtxn, external_document_ids); wtxn.commit().unwrap(); } From 8e0d9c9a5e89e0fc0612ba61af6f25fbc358e2b6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Oct 2023 12:16:16 +0200 Subject: [PATCH 066/127] Recover delete_documents tests that were too eagerly deleted --- milli/src/update/index_documents/mod.rs | 533 +++++++++++++++++++++++- 1 file changed, 532 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5f5c418d9..b439ca409 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -690,13 +690,15 @@ fn execute_word_prefix_docids( #[cfg(test)] mod tests { use big_s::S; + use fst::IntoStreamer; + use heed::RwTxn; use maplit::hashset; use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; - use crate::{db_snap, BEU16}; + use crate::{db_snap, Filter, Search, BEU16}; #[test] fn simple_document_replacement() { @@ -2676,4 +2678,533 @@ mod tests { let res = index.search(&rtxn).execute().unwrap(); index.documents(&rtxn, res.documents_ids).unwrap(); } + + fn delete_documents<'t>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t TempIndex, + external_ids: &[&str], + ) -> Vec { + let external_document_ids = index.external_documents_ids(wtxn).unwrap(); + let ids_to_delete: Vec = external_ids + .iter() + .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) + .collect(); + + // Delete some documents. 
+ index.delete_documents_using_wtxn( + wtxn, + external_ids.iter().map(ToString::to_string).collect(), + ); + + ids_to_delete + } + + #[test] + fn delete_documents_with_numbers_as_primary_key() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]), + ) + .unwrap(); + + // delete those documents, ids are synchronous therefore 0, 1, and 2. + index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]); + + wtxn.commit().unwrap(); + + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids); + db_snap!(index, word_docids); + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, facet_id_exists_docids); + + let rtxn = index.read_txn().unwrap(); + + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + } + + #[test] + fn delete_documents_with_strange_primary_key() { + let index = TempIndex::new(); + + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mysuperid": 0, "name": "kevin" }, + { "mysuperid": 1, "name": "kevina" }, + { "mysuperid": 2, "name": "benoit" } + ]), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. 
+ index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]); + + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids); + db_snap!(index, word_docids); + db_snap!(index, word_pair_proximity_docids); + } + + #[test] + fn filtered_placeholder_search_should_not_return_deleted_documents_() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! { S("label"), S("label2") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]); + + // 
Placeholder search with filter + let filter = Filter::from_str("label = sign").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(results.documents_ids.is_empty()); + + wtxn.commit().unwrap(); + + db_snap!(index, word_docids); + db_snap!(index, facet_id_f64_docids); + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, facet_id_exists_docids); + db_snap!(index, facet_id_string_docids); + } + + #[test] + fn placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { 
"docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + + // Placeholder search + let results = index.search(&wtxn).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { 
"docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // search for abstract + let results = index.search(&wtxn).query("abstract").execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("id")); + settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_sortable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, + { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, + { "id": 
"14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, + { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } + ])).unwrap(); + + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); + + // Placeholder search with geo filter + let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + + db_snap!(index, facet_id_f64_docids); + db_snap!(index, facet_id_string_docids); + } + + #[test] + fn get_documents_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", 
"label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_external_ids = ["1_7", "1_52"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + + // list all documents + let results = index.all_documents(&wtxn).unwrap(); + for result in results { + let (id, _) = result.unwrap(); + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + + // list internal document ids + let results = index.documents_ids(&wtxn).unwrap(); + for id in results { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // get internal docids from deleted external document ids + let results = index.external_documents_ids(&rtxn).unwrap(); + for id in deleted_external_ids { + assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); + } + drop(rtxn); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, 
|settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", "label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} + ])).unwrap(); + + delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // count internal documents + let results = index.number_of_documents(&wtxn).unwrap(); + assert_eq!(18, results); + + // count field distribution + let results = index.field_distribution(&wtxn).unwrap(); + assert_eq!(Some(&18), results.get("label")); + assert_eq!(Some(&1), results.get("title")); + assert_eq!(Some(&2), results.get("number")); + + wtxn.commit().unwrap(); + } + + #[test] + fn stored_detected_script_and_language_should_not_return_deleted_documents() { + use charabia::{Language, Script}; + let index = TempIndex::new(); + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + 
documents!([ + { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }, + { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + let key_cmn = (Script::Cj, Language::Cmn); + let cj_cmn_docs = + index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(1); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + + delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let cj_cmn_docs = + index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } + + #[test] + fn delete_words_exact_attributes() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_searchable_fields(vec![S("text"), S("exact")]); + settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "text": "hello" }, + { "id": 1, "exact": "hello"} + ])) + .unwrap(); + db_snap!(index, word_docids, 1, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 1, @r###" + hello [1, ] + "###); + db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + let mut wtxn = index.write_txn().unwrap(); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + 
+ db_snap!(index, word_docids, 2, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 2, @""); + db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); + let txn = index.read_txn().unwrap(); + let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); + insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); + + let mut s = Search::new(&txn, &index); + s.query("hello"); + let crate::SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + } } From 6260cff65ff435aae61878c45275e0d8922546c9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Oct 2023 18:06:41 +0200 Subject: [PATCH 067/127] Actually delete documents from DB when the merge function says so --- .../cbo_roaring_bitmap_codec.rs | 13 +++++++--- milli/src/update/index_documents/mod.rs | 17 +----------- .../src/update/index_documents/typed_chunk.rs | 26 ++++++++++--------- 3 files changed, 24 insertions(+), 32 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 117da1308..f635e55af 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -102,11 +102,11 @@ impl CboRoaringBitmapCodec { } /// Merges a DelAdd delta into a CboRoaringBitmap. 
- pub fn merge_deladd_into( + pub fn merge_deladd_into<'a>( deladd: KvReaderDelAdd<'_>, previous: &[u8], - buffer: &mut Vec, - ) -> io::Result<()> { + buffer: &'a mut Vec, + ) -> io::Result> { // Deserialize the bitmap that is already there let mut previous = Self::deserialize_from(previous)?; @@ -120,7 +120,12 @@ impl CboRoaringBitmapCodec { previous |= Self::deserialize_from(value)?; } - previous.serialize_into(buffer) + if previous.is_empty() { + return Ok(None); + } + + Self::serialize_into(&previous, buffer); + Ok(Some(&buffer[..])) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b439ca409..45ceec7b0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -390,22 +390,7 @@ where return Err(Error::InternalError(InternalError::AbortedIndexation)); } - let typed_chunk = match result? { - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - }, - TypedChunk::WordPairProximityDocids(chunk) => { - TypedChunk::WordPairProximityDocids(chunk) - } - TypedChunk::WordPositionDocids(chunk) => TypedChunk::WordPositionDocids(chunk), - otherwise => otherwise, - }; + let typed_chunk = result?; // FIXME: return newly added as well as newly deleted documents let (docids, is_merged_database) = diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 1f1ac4adf..8257f7c93 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -484,11 +484,11 @@ fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Resul /// /// The first argument is the DelAdd obkv of CboRoaringBitmaps and /// the second one is the CboRoaringBitmap to merge into. 
-fn merge_deladd_cbo_roaring_bitmaps( +fn merge_deladd_cbo_roaring_bitmaps<'a>( deladd_obkv: &[u8], previous: &[u8], - buffer: &mut Vec, -) -> Result<()> { + buffer: &'a mut Vec, +) -> Result> { Ok(CboRoaringBitmapCodec::merge_deladd_into( KvReaderDelAdd::new(deladd_obkv), previous, @@ -509,7 +509,7 @@ fn write_entries_into_database( where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, - FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { puffin::profile_function!(format!("number of entries: {}", data.len())); @@ -521,17 +521,19 @@ where if valid_lmdb_key(key) { buffer.clear(); let value = if index_is_empty { - serialize_value(value, &mut buffer)? + Some(serialize_value(value, &mut buffer)?) } else { match database.get(wtxn, key)? { - Some(prev_value) => { - merge_values(value, prev_value, &mut buffer)?; - &buffer[..] - } - None => serialize_value(value, &mut buffer)?, + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), } }; - database.put(wtxn, key, value)?; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; + } + } } } @@ -553,7 +555,7 @@ fn append_entries_into_database( where R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, - FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, K: for<'a> heed::BytesDecode<'a>, { puffin::profile_function!(format!("number of entries: {}", data.len())); From fdf3f7f627aad98ecbd599e04230a505907f97c2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 26 Oct 2023 18:22:03 +0200 Subject: [PATCH 068/127] Fix facet distribution test --- .../src/update/index_documents/typed_chunk.rs | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs 
b/milli/src/update/index_documents/typed_chunk.rs index 8257f7c93..192f3d139 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -328,8 +328,18 @@ pub(crate) fn write_typed_chunk_into_index( index.field_id_docid_facet_f64s.remap_types::(); let mut cursor = fid_docid_facet_number.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { - index_fid_docid_facet_numbers.put(wtxn, key, value)?; + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?, + (Some(_), None) => { + index_fid_docid_facet_numbers.delete(wtxn, key)?; + } + (Some(_), Some(new)) => { + index_fid_docid_facet_numbers.put(wtxn, key, new)? + } + } } } } @@ -338,8 +348,18 @@ pub(crate) fn write_typed_chunk_into_index( index.field_id_docid_facet_strings.remap_types::(); let mut cursor = fid_docid_facet_string.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { - index_fid_docid_facet_strings.put(wtxn, key, value)?; + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?, + (Some(_), None) => { + index_fid_docid_facet_strings.delete(wtxn, key)?; + } + (Some(_), Some(new)) => { + index_fid_docid_facet_strings.put(wtxn, key, new)? 
+ } + } } } } From dfab6293c9f8829c41833e95d92963a6323f9b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 28 Oct 2023 12:56:46 +0200 Subject: [PATCH 069/127] Use an LMDB database to store the external documents ids --- index-scheduler/src/batch.rs | 7 +- meilisearch/src/routes/indexes/documents.rs | 4 +- milli/src/external_documents_ids.rs | 157 ++++++------------ milli/src/index.rs | 33 ++-- milli/src/update/clear_documents.rs | 5 +- milli/src/update/index_documents/transform.rs | 8 +- .../src/update/index_documents/typed_chunk.rs | 6 +- 7 files changed, 79 insertions(+), 141 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index c4f9c12be..c273d8ebb 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1575,11 +1575,14 @@ fn delete_document_by_filter<'a>( } e => e.into(), })?; - let external_documents_ids = index.external_documents_ids(wtxn)?; + let external_documents_ids = index.external_documents_ids(); // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings). // Since what we have is an iterator, it would be better to delete in chunks let external_to_internal: std::result::Result, RoaringBitmap> = - external_documents_ids.find_external_id_of(candidates).only_external_ids().collect(); + external_documents_ids + .find_external_id_of(wtxn, candidates)? 
+ .only_external_ids() + .collect(); let document_ids = match external_to_internal { Ok(external_ids) => external_ids, Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids), diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 2afc1b5fb..b6950ae6e 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -612,8 +612,8 @@ fn retrieve_document>( let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let internal_id = index - .external_documents_ids(&txn)? - .get(doc_id.as_bytes()) + .external_documents_ids() + .get(&txn, doc_id)? .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; let document = index diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 02794609f..1bf08396a 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,12 +1,11 @@ -use std::borrow::Cow; use std::collections::HashMap; use std::convert::TryInto; -use std::fmt; -use fst::Streamer; +use heed::types::{OwnedType, Str}; +use heed::{Database, RoIter, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use crate::DocumentId; +use crate::{DocumentId, BEU32}; pub enum DocumentOperationKind { Create, @@ -19,41 +18,31 @@ pub struct DocumentOperation { pub kind: DocumentOperationKind, } -pub struct ExternalDocumentsIds<'a>(fst::Map>); +pub struct ExternalDocumentsIds(Database>); -impl<'a> ExternalDocumentsIds<'a> { - pub fn new(fst: fst::Map>) -> ExternalDocumentsIds<'a> { - ExternalDocumentsIds(fst) - } - - pub fn into_static(self) -> ExternalDocumentsIds<'static> { - ExternalDocumentsIds(self.0.map_data(|c| Cow::Owned(c.into_owned())).unwrap()) +impl ExternalDocumentsIds { + pub fn new(db: Database>) -> ExternalDocumentsIds { + ExternalDocumentsIds(db) } /// Returns `true` if hard and soft external documents lists are empty. 
- pub fn is_empty(&self) -> bool { - self.0.is_empty() + pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result { + self.0.is_empty(rtxn).map_err(Into::into) } - pub fn get>(&self, external_id: A) -> Option { - let external_id = external_id.as_ref(); - self.0.get(external_id).map(|x| x.try_into().unwrap()) + pub fn get>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result> { + Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get().try_into().unwrap())) } /// An helper function to debug this type, returns an `HashMap` of both, /// soft and hard fst maps, combined. - pub fn to_hash_map(&self) -> HashMap { + pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result> { let mut map = HashMap::default(); - let mut stream = self.0.stream(); - while let Some((k, v)) = stream.next() { - let k = String::from_utf8(k.to_vec()).unwrap(); - map.insert(k, v.try_into().unwrap()); + for result in self.0.iter(rtxn)? { + let (external, internal) = result?; + map.insert(external.to_owned(), internal.get().try_into().unwrap()); } - map - } - - pub fn as_bytes(&self) -> &[u8] { - self.0.as_fst().as_bytes() + Ok(map) } /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between @@ -65,12 +54,12 @@ impl<'a> ExternalDocumentsIds<'a> { /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found. /// In that case the returned bitmap contains the internal ids whose external ids were not found after traversing /// the entire fst. - pub fn find_external_id_of( + pub fn find_external_id_of<'t>( &self, + rtxn: &'t RoTxn, internal_ids: RoaringBitmap, - ) -> ExternalToInternalOwnedIterator<'_> { - let it = ExternalToInternalOwnedIterator { stream: self.0.stream(), internal_ids }; - it + ) -> heed::Result> { + self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids }) } /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. 
@@ -81,84 +70,39 @@ impl<'a> ExternalDocumentsIds<'a> { /// /// - If attempting to delete a document that doesn't exist /// - If attempting to create a document that already exists - pub fn apply(&mut self, mut operations: Vec) { - operations.sort_unstable_by(|left, right| left.external_id.cmp(&right.external_id)); - operations.dedup_by(|left, right| left.external_id == right.external_id); - - let mut builder = fst::MapBuilder::memory(); - - let mut stream = self.0.stream(); - let mut next_stream = stream.next(); - let mut operations = operations.iter(); - let mut next_operation = operations.next(); - - loop { - (next_stream, next_operation) = match (next_stream.take(), next_operation.take()) { - (None, None) => break, - (None, Some(DocumentOperation { external_id, internal_id, kind })) => { - if matches!(kind, DocumentOperationKind::Delete) { + pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec) -> heed::Result<()> { + for DocumentOperation { external_id, internal_id, kind } in operations { + match kind { + DocumentOperationKind::Create => { + // TODO should we get before insert to be able to detect bugs? + // if matches!(kind, DocumentOperationKind::Create) { + // panic!("Attempting to create an already-existing document"); + // } + self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?; + } + DocumentOperationKind::Delete => { + if !self.0.delete(wtxn, &external_id)? 
{ panic!("Attempting to delete a non-existing document") } - builder.insert(external_id, (*internal_id).into()).unwrap(); - (None, operations.next()) } - (Some((k, v)), None) => { - builder.insert(k, v).unwrap(); - (stream.next(), None) - } - ( - current_stream @ Some((left_external_id, left_internal_id)), - current_operation @ Some(DocumentOperation { - external_id: right_external_id, - internal_id: right_internal_id, - kind, - }), - ) => match left_external_id.cmp(right_external_id.as_bytes()) { - std::cmp::Ordering::Less => { - builder.insert(left_external_id, left_internal_id).unwrap(); - (stream.next(), current_operation) - } - std::cmp::Ordering::Greater => { - builder.insert(right_external_id, (*right_internal_id).into()).unwrap(); - (current_stream, operations.next()) - } - std::cmp::Ordering::Equal => { - if matches!(kind, DocumentOperationKind::Create) { - panic!("Attempting to create an already-existing document"); - } - // we delete the document, so we just advance both iterators to skip in stream - (stream.next(), operations.next()) - } - }, } } - self.0 = builder.into_map().map_data(Cow::Owned).unwrap(); - } -} -impl fmt::Debug for ExternalDocumentsIds<'_> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() - } -} - -impl Default for ExternalDocumentsIds<'static> { - fn default() -> Self { - ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap()) + Ok(()) } } /// An iterator over mappings between requested internal ids and external ids. /// /// See [`ExternalDocumentsIds::find_external_id_of`] for details. 
-pub struct ExternalToInternalOwnedIterator<'it> { - stream: fst::map::Stream<'it>, +pub struct ExternalToInternalOwnedIterator<'t> { + iter: RoIter<'t, Str, OwnedType>, internal_ids: RoaringBitmap, } -impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> { +impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> { /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids. - type Item = Result<(String, DocumentId), RoaringBitmap>; + type Item = Result<(&'t str, DocumentId), RoaringBitmap>; fn next(&mut self) -> Option { // if all requested ids were found, we won't find any other, so short-circuit @@ -166,23 +110,28 @@ impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> { return None; } loop { - let Some((external, internal)) = self.stream.next() else { - // we exhausted the stream but we still have some internal ids to find - let remaining_ids = std::mem::take(&mut self.internal_ids); - return Some(Err(remaining_ids)); - // note: next calls to `next` will return `None` since we replaced the internal_ids - // with the default empty bitmap + let (external, internal) = match self.iter.next() { + Some(Ok((external, internal))) => (external, internal), + // TODO manage this better, remove panic + Some(Err(e)) => panic!("{}", e), + _ => { + // we exhausted the stream but we still have some internal ids to find + let remaining_ids = std::mem::take(&mut self.internal_ids); + return Some(Err(remaining_ids)); + // note: next calls to `next` will return `None` since we replaced the internal_ids + // with the default empty bitmap + } }; - let internal = internal.try_into().unwrap(); + let internal = internal.get(); let was_contained = self.internal_ids.remove(internal); if was_contained { - return Some(Ok((std::str::from_utf8(external).unwrap().to_owned(), internal))); + return Some(Ok((external, internal))); } } } } -impl<'it> ExternalToInternalOwnedIterator<'it> { +impl<'t> 
ExternalToInternalOwnedIterator<'t> { /// Returns the bitmap of internal ids whose external id are yet to be found pub fn remaining_internal_ids(&self) -> &RoaringBitmap { &self.internal_ids @@ -191,7 +140,7 @@ impl<'it> ExternalToInternalOwnedIterator<'it> { /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids. /// /// Use this when you don't need the mapping between the external and the internal ids. - pub fn only_external_ids(self) -> impl Iterator> + 'it { - self.map(|res| res.map(|(external, _internal)| external)) + pub fn only_external_ids(self) -> impl Iterator> + 't { + self.map(|res| res.map(|(external, _internal)| external.to_owned())) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index d99c36b65..f8a37fb2b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -51,7 +51,6 @@ pub mod main_key { /// It is concatenated with a big-endian encoded number (non-human readable). /// e.g. vector-hnsw0x0032. pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; - pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; @@ -81,6 +80,7 @@ pub mod db_name { pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; + pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; @@ -112,6 +112,9 @@ pub struct Index { /// Contains many different types (e.g. the fields ids map). 
pub(crate) main: PolyDatabase, + /// Maps the external documents ids with the internal document id. + pub external_documents_ids: Database>, + /// A word and all the documents ids containing the word. pub word_docids: Database, @@ -183,13 +186,15 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(25); + options.max_dbs(26); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let mut wtxn = env.write_txn()?; let main = env.create_poly_database(&mut wtxn, Some(MAIN))?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; + let external_documents_ids = + env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?; let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; let exact_word_prefix_docids = @@ -235,6 +240,7 @@ impl Index { Ok(Index { env, main, + external_documents_ids, word_docids, exact_word_docids, word_prefix_docids, @@ -386,29 +392,10 @@ impl Index { /* external documents ids */ - /// Writes the external documents ids and internal ids (i.e. `u32`). - pub(crate) fn put_external_documents_ids( - &self, - wtxn: &mut RwTxn, - external_documents_ids: &ExternalDocumentsIds<'_>, - ) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>( - wtxn, - main_key::EXTERNAL_DOCUMENTS_IDS_KEY, - external_documents_ids.as_bytes(), - )?; - Ok(()) - } - /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). 
- pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let fst = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::EXTERNAL_DOCUMENTS_IDS_KEY)?; - let fst = match fst { - Some(fst) => fst::Map::new(fst)?.map_data(Cow::Borrowed)?, - None => fst::Map::default().map_data(Cow::Owned)?, - }; - Ok(ExternalDocumentsIds::new(fst)) + pub fn external_documents_ids(&self) -> ExternalDocumentsIds { + ExternalDocumentsIds::new(self.external_documents_ids) } /* fields ids map */ diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ca5f69808..7f528e928 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::{FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -20,6 +20,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { let Index { env: _env, main: _main, + external_documents_ids, word_docids, exact_word_docids, word_prefix_docids, @@ -54,7 +55,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; - self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &empty_roaring)?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; @@ -62,6 +62,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.delete_vector_hnsw(self.wtxn)?; // Clear the other databases. 
+ external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 872230d99..98079e07b 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -162,7 +162,7 @@ impl<'a, 'i> Transform<'a, 'i> { FA: Fn() -> bool + Sync, { let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(); let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; let primary_key = cursor.primary_key().to_string(); @@ -221,7 +221,7 @@ impl<'a, 'i> Transform<'a, 'i> { let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { Entry::Occupied(entry) => *entry.get() as u32, Entry::Vacant(entry) => { - let docid = match external_documents_ids.get(entry.key()) { + let docid = match external_documents_ids.get(wtxn, entry.key())? { Some(docid) => { // If it was already in the list of replaced documents it means it was deleted // by the remove_document method. We should starts as if it never existed. @@ -373,7 +373,7 @@ impl<'a, 'i> Transform<'a, 'i> { to_remove.sort_unstable(); to_remove.dedup(); - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(); let mut documents_deleted = 0; let mut document_sorter_buffer = Vec::new(); @@ -410,7 +410,7 @@ impl<'a, 'i> Transform<'a, 'i> { // If the document was already in the db we mark it as a `to_delete` document. // Then we push the document in sorters in deletion mode. - let deleted_from_db = match external_documents_ids.get(&to_remove) { + let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? 
{ Some(docid) => { self.replaced_documents_ids.insert(docid); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 192f3d139..1b38be03b 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -194,10 +194,8 @@ pub(crate) fn write_typed_chunk_into_index( db.delete(wtxn, &BEU32::new(docid))?; } } - let mut external_documents_docids = index.external_documents_ids(wtxn)?.into_static(); - external_documents_docids.apply(operations); - index.put_external_documents_ids(wtxn, &external_documents_docids)?; - + let external_documents_docids = index.external_documents_ids(); + external_documents_docids.apply(wtxn, operations)?; index.put_documents_ids(wtxn, &docids)?; } TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { From abf424ebfc1addeb60ad897e9bf210e9d4a38e04 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Oct 2023 11:41:56 +0100 Subject: [PATCH 070/127] Remove unused FromIterator --- milli/src/update/index_documents/extract/extract_word_docids.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index a95162236..5266e9bff 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,7 +1,6 @@ use std::collections::{BTreeSet, HashSet}; use std::fs::File; use std::io::{self, BufReader}; -use std::iter::FromIterator; use heed::BytesDecode; use obkv::KvReaderU16; From 58690dfb19971fa4d5dc949135e987fc19ea4b63 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Oct 2023 13:34:07 +0100 Subject: [PATCH 071/127] Fix tests compilation after changes to ExternalDocumentsIds API --- milli/src/snapshot_tests.rs | 2 +- milli/src/update/clear_documents.rs | 2 +- milli/src/update/index_documents/mod.rs | 26 
++++++++++++++----------- milli/tests/search/mod.rs | 8 +++++--- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 730d0a5c8..f3f1eb5a5 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -332,7 +332,7 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { } pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); - let external_ids = index.external_documents_ids(&rtxn).unwrap().to_hash_map(); + let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap(); // ensure fixed order (not guaranteed by hashmap) let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect(); external_ids.sort_by(|(l, _), (r, _)| l.cmp(r)); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 7f528e928..265c6f15a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -122,7 +122,7 @@ mod tests { assert!(index.words_fst(&rtxn).unwrap().is_empty()); assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); - assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.external_documents_ids().is_empty(&rtxn).unwrap()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.geo_rtree(&rtxn).unwrap().is_none()); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 45ceec7b0..3026ce81c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1094,8 +1094,8 @@ mod tests { let txn = index.read_txn().unwrap(); assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId")); - let external_documents_ids = index.external_documents_ids(&txn).unwrap(); - assert!(external_documents_ids.get("30").is_none()); + let external_documents_ids = 
index.external_documents_ids(); + assert!(external_documents_ids.get(&txn, "30").unwrap().is_none()); index .add_documents(documents!([ @@ -1104,8 +1104,8 @@ mod tests { .unwrap(); let wtxn = index.write_txn().unwrap(); - let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); - assert!(external_documents_ids.get("30").is_some()); + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some()); wtxn.commit().unwrap(); index @@ -1399,8 +1399,8 @@ mod tests { index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap(); let rtxn = index.read_txn().unwrap(); - let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); - assert!(external_documents_ids.get("1").is_some()); + let external_documents_ids = index.external_documents_ids(); + assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some()); } #[test] @@ -1665,7 +1665,7 @@ mod tests { let wtxn = index.read_txn().unwrap(); - let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); + let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap(); let ids = map.values().collect::>(); assert_eq!(ids.len(), map.len()); @@ -2669,10 +2669,10 @@ mod tests { index: &'t TempIndex, external_ids: &[&str], ) -> Vec { - let external_document_ids = index.external_documents_ids(wtxn).unwrap(); + let external_document_ids = index.external_documents_ids(); let ids_to_delete: Vec = external_ids .iter() - .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) + .map(|id| external_document_ids.get(&wtxn, id).unwrap().unwrap()) .collect(); // Delete some documents. 
@@ -3052,9 +3052,13 @@ mod tests { let rtxn = index.read_txn().unwrap(); // get internal docids from deleted external document ids - let results = index.external_documents_ids(&rtxn).unwrap(); + let results = index.external_documents_ids(); for id in deleted_external_ids { - assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); + assert!( + results.get(&rtxn, id).unwrap().is_none(), + "The document {} was supposed to be deleted", + id + ); } drop(rtxn); } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 1c68cfff2..9193ab762 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -88,9 +88,11 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec { let rtxn = index.read_txn().unwrap(); - let docid_map = index.external_documents_ids(&rtxn).unwrap(); - let docid_map: std::collections::HashMap<_, _> = - EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); + let docid_map = index.external_documents_ids(); + let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS + .iter() + .map(|id| (docid_map.get(&rtxn, id).unwrap().unwrap(), id)) + .collect(); internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() } From 54d07a8da3854a99263c6c74096d09fd139d5f20 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Oct 2023 14:47:51 +0100 Subject: [PATCH 072/127] Update field distribution taking into account both deletions and additions --- milli/src/update/index_documents/transform.rs | 65 ++++++++++++++----- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 98079e07b..05940822a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,5 +1,6 @@ use 
std::borrow::Cow; -use std::collections::hash_map::Entry; +use std::collections::btree_map::Entry as BEntry; +use std::collections::hash_map::Entry as HEntry; use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -20,7 +21,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::into_del_add_obkv; +use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd}; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -219,8 +220,8 @@ impl<'a, 'i> Transform<'a, 'i> { let mut original_docid = None; let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { - Entry::Occupied(entry) => *entry.get() as u32, - Entry::Vacant(entry) => { + HEntry::Occupied(entry) => *entry.get() as u32, + HEntry::Vacant(entry) => { let docid = match external_documents_ids.get(wtxn, entry.key())? { Some(docid) => { // If it was already in the list of replaced documents it means it was deleted @@ -388,7 +389,7 @@ impl<'a, 'i> Transform<'a, 'i> { .entry((*to_remove).into()) { // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. - Entry::Occupied(entry) => { + HEntry::Occupied(entry) => { let doc_id = *entry.get() as u32; document_sorter_buffer.clear(); document_sorter_buffer.push(Operation::Deletion as u8); @@ -405,7 +406,7 @@ impl<'a, 'i> Transform<'a, 'i> { entry.remove_entry(); true } - Entry::Vacant(_) => false, + HEntry::Vacant(_) => false, }; // If the document was already in the db we mark it as a `to_delete` document. @@ -657,8 +658,6 @@ impl<'a, 'i> Transform<'a, 'i> { // 2. 
Add all the new documents to the field distribution let mut field_distribution = self.index.field_distribution(wtxn)?; - self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; - // Here we are going to do the document count + field distribution + `write_into_stream_writer` let mut iter = self.original_sorter.into_stream_merger_iter()?; // used only for the callback @@ -678,13 +677,49 @@ impl<'a, 'i> Transform<'a, 'i> { // We increment all the field of the current document in the field distribution. let obkv = KvReader::new(val); - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - *field_distribution.entry(name.to_string()).or_insert(0) += 1; + for (key, value) in obkv.iter() { + let reader = KvReaderDelAdd::new(value); + match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { + (None, None) => {} + (None, Some(_)) => { + // New field + let name = self.fields_ids_map.name(key).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + }, + )?; + *field_distribution.entry(name.to_string()).or_insert(0) += 1; + } + (Some(_), None) => { + // Field removed + let name = self.fields_ids_map.name(key).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + }, + )?; + match field_distribution.entry(name.to_string()) { + BEntry::Vacant(_) => { /* Bug? 
trying to remove a non-existing field */ + } + BEntry::Occupied(mut entry) => { + // attempt to remove one + match entry.get_mut().checked_sub(1) { + Some(new_val) => { + *entry.get_mut() = new_val; + } + None => { + // was 0, remove field from distribution + entry.remove(); + } + } + } + } + } + (Some(_), Some(_)) => { + // Value change, no field distribution change + } + } } writer.insert(key, val)?; } From 9fedd8101aaa380f68738778994f603ddb75de2b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Oct 2023 14:48:41 +0100 Subject: [PATCH 073/127] Fix tests --- .../tests/documents/delete_documents.rs | 2 +- milli/src/index.rs | 22 +++--- .../documents_ids.snap | 4 ++ .../facet_id_exists_docids.snap | 4 ++ .../word_docids.snap | 4 ++ .../word_pair_proximity_docids.snap | 4 ++ .../documents_ids.snap | 4 ++ .../word_docids.snap | 5 ++ .../word_pair_proximity_docids.snap | 4 ++ .../facet_id_exists_docids.snap | 6 ++ .../facet_id_f64_docids.snap | 53 +++++++++++++++ .../facet_id_string_docids.snap | 4 ++ .../updated/word_docids.snap | 68 +++++++++---------- 13 files changed, 136 insertions(+), 48 deletions(-) create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap create mode 100644 
milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap diff --git a/meilisearch/tests/documents/delete_documents.rs b/meilisearch/tests/documents/delete_documents.rs index b3f04aea0..5a15e95ff 100644 --- a/meilisearch/tests/documents/delete_documents.rs +++ b/meilisearch/tests/documents/delete_documents.rs @@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() { "canceledBy": null, "details": { "providedIds": 0, - "deletedDocuments": 4, + "deletedDocuments": 2, "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]" }, "error": null, diff --git a/milli/src/index.rs b/milli/src/index.rs index f8a37fb2b..27ad72fad 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1946,14 +1946,14 @@ pub(crate) mod tests { 3 3 "###); db_snap!(index, facet_id_f64_docids, 3, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] + 0 0 0 1 [0, ] + 0 0 1 1 [1, ] + 0 0 2 1 [2, ] + 0 0 3 1 [3, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, ] + 1 0 4 1 [3, ] "###); } @@ -2038,9 +2038,9 @@ pub(crate) mod tests { 3 3 "###); db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, 4, ] - 1 0 1 1 [1, 5, ] - 1 0 2 1 [2, 6, ] + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] 1 0 3 1 [3, ] "###); } diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap 
b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap new file mode 100644 index 000000000..8b27dcb0d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git 
a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap new file mode 100644 index 000000000..8a9805f8d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[2, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap new file mode 100644 index 000000000..bb2f64873 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +benoit [2, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap new file mode 100644 index 000000000..ed120bf02 --- /dev/null +++ 
b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [21, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap new file mode 100644 index 000000000..5d6009823 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -0,0 +1,53 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +3 0 48.9021 1 [19, ] +3 0 49.4449 1 [] +3 0 49.9314 1 [17, ] +3 0 50.1112 1 [] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4095 1 [] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.6552 1 [] +3 0 50.6924 1 [] +3 0 50.7263 1 [] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +3 1 48.9021 4 [17, 19, ] +3 1 50.1793 4 [13, 14, 15, ] +3 1 50.4502 4 [0, 3, 8, 12, ] +3 1 50.6312 4 [1, 2, ] +3 1 50.7263 4 [7, 9, 10, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 2.7913 1 [] +4 0 2.8547 1 [] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.1541 1 [] +4 0 3.1763 1 [] +4 0 3.1897 1 [] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] +4 0 4.4347 1 [] +4 1 2.271 4 [14, 17, 19, ] +4 1 2.8547 4 [0, 1, 2, 3, ] +4 1 3.1541 4 [15, ] +4 1 3.2206 4 [7, 8, 9, 13, ] +4 1 3.9623 3 [10, 12, ] + diff --git 
a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap new file mode 100644 index 000000000..cdff1a607 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap index b0ef38b93..80dbce9e8 100644 --- a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap @@ -1,60 +1,56 @@ --- source: milli/src/update/index_documents/mod.rs --- -0 [1, 7, ] +0 [1, ] 1 [2, ] -10 [1, 7, ] -12 [0, 8, ] +10 [1, ] +12 [0, ] 1344 [3, ] -1813 [8, ] -2 [0, 8, ] +1813 [0, ] +2 [0, ] 23 [5, ] 25 [2, ] -3 [0, 8, ] +3 [0, ] 35 [5, ] -4 [4, 6, ] -42 [0, 5, 8, ] -456 [1, 7, ] -5 [0, 8, ] +4 [4, ] +42 [0, 5, ] +456 [1, ] +5 [0, ] 99 [2, ] adams [5, ] -adventure [1, 7, ] +adventure [1, ] alice [2, ] -and [0, 4, 6, 8, ] -antoine [1, 7, ] -austen [8, ] -austin [0, ] -blood [4, 6, ] +and [0, 4, ] +antoine [1, ] +austen [0, ] +blood [4, ] carroll [2, ] -de [1, 7, ] +de [1, ] douglas [5, ] -exupery [1, 7, ] -fantasy [2, 3, 4, 6, ] +exupery [1, ] +fantasy [2, 3, 4, ] galaxy [5, ] guide [5, ] -half [4, 6, ] -harry [4, 6, ] +half [4, ] +harry [4, ] hitchhiker [5, ] hobbit [3, ] in [2, ] -j [3, 4, 6, 8, ] -jane [0, ] -k [4, 6, ] -le [1, ] +j [0, 3, 4, ] +k [4, ] lewis [2, ] -little [7, ] -petit [1, ] -potter [4, 6, ] -prejudice 
[0, 8, ] -pride [0, 8, ] -prince [1, 4, 7, ] -princess [6, ] +little [1, ] +potter [4, ] +prejudice [0, ] +pride [0, ] +prince [1, ] +princess [4, ] r [3, ] -romance [0, 8, ] -rowling [4, 6, ] +romance [0, ] +rowling [4, ] s [5, ] -saint [1, 7, ] -the [3, 4, 5, 6, 7, ] +saint [1, ] +the [1, 3, 4, 5, ] to [5, ] tolkien [3, ] wonderland [2, ] From be395c7944204d90e9fd663f8bc5d01f1855be50 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Oct 2023 16:26:29 +0100 Subject: [PATCH 074/127] Change order of arguments to tokenizer_builder --- .../index_documents/extract/extract_docid_word_positions.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index e5d95cbdb..96156adb4 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -56,7 +56,7 @@ pub fn extract_docid_word_positions( let mut value_buffer = Vec::new(); // initialize tokenizer. - let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None); + let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); let tokenizer = builder.build(); // iterate over documents. @@ -247,8 +247,8 @@ fn lang_safe_tokens_from_document<'a>( // build a new temporary tokenizer including the allow list. 
let mut builder = tokenizer_builder( stop_words, - dictionary, allowed_separators, + dictionary, Some(&script_language), ); let tokenizer = builder.build(); From de10f20732accd83b096a6b0dea5121673bf4ab4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Oct 2023 16:57:08 +0100 Subject: [PATCH 075/127] Fix field distribution again --- milli/src/update/index_documents/transform.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 05940822a..840bade2e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -674,10 +674,7 @@ impl<'a, 'i> Transform<'a, 'i> { total_documents: self.documents_count, }); - // We increment all the field of the current document in the field distribution. - let obkv = KvReader::new(val); - - for (key, value) in obkv.iter() { + for (key, value) in KvReader::new(val) { let reader = KvReaderDelAdd::new(value); match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { (None, None) => {} @@ -705,12 +702,14 @@ impl<'a, 'i> Transform<'a, 'i> { BEntry::Occupied(mut entry) => { // attempt to remove one match entry.get_mut().checked_sub(1) { + Some(0) => { + entry.remove(); + } Some(new_val) => { *entry.get_mut() = new_val; } None => { - // was 0, remove field from distribution - entry.remove(); + unreachable!("Attempting to remove a field that wasn't in the field distribution") } } } From 4e91707a064f2ab0c5b5cf2baab425f15e2a915e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 09:41:17 +0100 Subject: [PATCH 076/127] Rename test --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3026ce81c..ad2f63beb 100644 --- a/milli/src/update/index_documents/mod.rs +++ 
b/milli/src/update/index_documents/mod.rs @@ -2750,7 +2750,7 @@ mod tests { } #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents_() { + fn filtered_placeholder_search_should_not_return_deleted_documents() { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); From dad78cbf8de82434b00621a4a3693b32a33c2a70 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 09:53:55 +0100 Subject: [PATCH 077/127] Bulk facet remove deletes keys from DB when value empty --- milli/src/update/facet/bulk.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index c0b159e57..5626a4aae 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -3,7 +3,7 @@ use std::io::BufReader; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; @@ -14,7 +14,7 @@ use crate::heed_codec::facet::{ use crate::heed_codec::ByteSliceRefCodec; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases /// by rebuilding the database "from scratch". 
@@ -181,7 +181,13 @@ impl FacetsUpdateBulkInner { buffer.extend_from_slice(value); } }; - database.put(wtxn, key, &buffer)?; + let new_bitmap = &buffer[1..]; + // if the new bitmap is empty, let's remove it + if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 { + database.delete(wtxn, key)?; + } else { + database.put(wtxn, key, &buffer)?; + } } } Ok(()) From 9d59e8011ace80b403b290cef29abf0d84f66835 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 10:08:36 +0100 Subject: [PATCH 078/127] fix some tests --- milli/src/index.rs | 16 ++++++-------- .../facet_id_f64_docids.snap | 22 ------------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 27ad72fad..f7450a672 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1905,10 +1905,9 @@ pub(crate) mod tests { 3 3 "###); db_snap!(index, facet_id_f64_docids, 2, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, 4, ] - 1 0 2 1 [2, 5, ] - 1 0 3 1 [3, 6, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, 3, ] "###); index @@ -1924,11 +1923,10 @@ pub(crate) mod tests { 3 3 "###); db_snap!(index, facet_id_f64_docids, 3, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, 4, ] - 1 0 2 1 [2, 5, ] - 1 0 3 1 [3, 6, ] - 1 0 4 1 [7, ] + 1 0 1 1 [0, ] + 1 0 2 1 [1, ] + 1 0 3 1 [2, ] + 1 0 4 1 [3, ] "###); index diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap index 5d6009823..c45c350e7 100644 --- a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap +++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -2,41 +2,25 @@ 
source: milli/src/update/index_documents/mod.rs --- 3 0 48.9021 1 [19, ] -3 0 49.4449 1 [] 3 0 49.9314 1 [17, ] -3 0 50.1112 1 [] 3 0 50.1793 1 [15, ] 3 0 50.2844 1 [14, ] 3 0 50.3518 1 [13, ] -3 0 50.4095 1 [] 3 0 50.4502 1 [12, ] 3 0 50.6053 1 [8, ] 3 0 50.6224 1 [3, ] 3 0 50.6299 1 [0, ] 3 0 50.6312 1 [2, ] 3 0 50.6415 1 [1, ] -3 0 50.6552 1 [] -3 0 50.6924 1 [] -3 0 50.7263 1 [] 3 0 50.7453 1 [7, ] 3 0 50.8466 1 [10, ] 3 0 51.0537 1 [9, ] -3 1 48.9021 4 [17, 19, ] -3 1 50.1793 4 [13, 14, 15, ] -3 1 50.4502 4 [0, 3, 8, 12, ] -3 1 50.6312 4 [1, 2, ] -3 1 50.7263 4 [7, 9, 10, ] 4 0 2.271 1 [17, ] 4 0 2.3708 1 [19, ] 4 0 2.7637 1 [14, ] -4 0 2.7913 1 [] -4 0 2.8547 1 [] 4 0 3.0569 1 [0, ] 4 0 3.1106 1 [1, 2, ] 4 0 3.1476 1 [3, ] -4 0 3.1541 1 [] -4 0 3.1763 1 [] -4 0 3.1897 1 [] 4 0 3.2189 1 [15, ] 4 0 3.2206 1 [7, ] 4 0 3.3758 1 [8, ] @@ -44,10 +28,4 @@ source: milli/src/update/index_documents/mod.rs 4 0 3.6957 1 [9, ] 4 0 3.9623 1 [12, ] 4 0 4.337 1 [10, ] -4 0 4.4347 1 [] -4 1 2.271 4 [14, 17, 19, ] -4 1 2.8547 4 [0, 1, 2, 3, ] -4 1 3.1541 4 [15, ] -4 1 3.2206 4 [7, 8, 9, 13, ] -4 1 3.9623 3 [10, 12, ] From d8bf3f3fc2fc3511ceb7d2d15dc445766c597b76 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 10:12:49 +0100 Subject: [PATCH 079/127] Remove unused snapshots --- ...dump__reader__test__import_dump_v1-11.snap | 24 ------- .../dump__reader__test__import_dump_v1-5.snap | 38 ----------- .../dump__reader__test__import_dump_v1-8.snap | 31 --------- .../bulk.rs/insert/default.hash.snap | 4 -- .../large_group_small_min_level.hash.snap | 4 -- .../insert/odd_group_odd_min_level.hash.snap | 4 -- .../small_group_large_min_level.hash.snap | 4 -- .../small_group_small_min_level.hash.snap | 4 -- .../default.hash.snap | 4 -- .../large_group_small_min_level.hash.snap | 4 -- .../odd_group_odd_min_level.hash.snap | 4 -- .../small_group_large_min_level.hash.snap | 4 -- .../small_group_small_min_level.hash.snap | 4 -- .../bulk.rs/insert_string/default.hash.snap | 4 
-- .../large_group_small_min_level.hash.snap | 4 -- .../odd_group_odd_min_level.hash.snap | 4 -- .../small_group_large_min_level.hash.snap | 4 -- .../small_group_small_min_level.hash.snap | 4 -- .../facet_id_exists_docids.snap | 6 -- .../prefix_word_pair_proximity_docids.snap | 20 ------ .../word_prefix_pair_proximity_docids.snap | 23 ------- .../prefix_word_pair_proximity_docids.snap | 29 --------- .../update/word_pair_proximity_docids.snap | 33 ---------- .../word_prefix_pair_proximity_docids.snap | 31 --------- .../prefix_word_pair_proximity_docids.snap | 4 -- .../word_pair_proximity_docids.snap | 8 --- .../word_prefix_pair_proximity_docids.snap | 7 -- .../first_delete/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 6 -- .../first_delete/word_docids.snap | 60 ----------------- .../word_prefix_pair_proximity_docids.snap | 10 --- .../initial/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 14 ---- .../initial/word_docids.snap | 65 ------------------- .../word_prefix_pair_proximity_docids.snap | 15 ----- .../reupdate/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 6 -- .../reupdate/word_docids.snap | 60 ----------------- .../word_prefix_pair_proximity_docids.snap | 5 -- .../second_delete/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 6 -- .../second_delete/word_docids.snap | 10 --- .../word_prefix_pair_proximity_docids.snap | 10 --- .../initial/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 9 --- .../initial/word_docids.snap | 61 ----------------- .../word_prefix_pair_proximity_docids.snap | 7 -- .../replaced/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 5 -- .../replaced/word_docids.snap | 61 ----------------- .../word_prefix_pair_proximity_docids.snap | 5 -- .../initial/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 9 --- .../initial/word_docids.snap | 61 ----------------- 
.../word_prefix_pair_proximity_docids.snap | 7 -- .../replaced/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 10 --- .../replaced/word_docids.hash.snap | 4 -- .../word_prefix_pair_proximity_docids.snap | 8 --- .../first_delete/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 14 ---- .../first_delete/word_docids.snap | 65 ------------------- .../word_prefix_pair_proximity_docids.snap | 15 ----- .../initial/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 14 ---- .../initial/word_docids.snap | 65 ------------------- .../word_prefix_pair_proximity_docids.snap | 15 ----- .../reupdate/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 17 ----- .../reupdate/word_docids.hash.snap | 4 -- .../word_prefix_pair_proximity_docids.snap | 19 ------ .../second_delete/documents_ids.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 14 ---- .../second_delete/word_docids.snap | 65 ------------------- .../word_prefix_pair_proximity_docids.snap | 15 ----- .../always_hard/documents_ids.snap | 4 -- .../always_hard/facet_id_exists_docids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../always_hard/word_docids.snap | 4 -- .../word_pair_proximity_docids.snap | 4 -- .../always_soft/documents_ids.snap | 4 -- .../always_soft/facet_id_exists_docids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../always_soft/word_docids.snap | 4 -- .../word_pair_proximity_docids.snap | 4 -- .../always_hard/documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../always_hard/word_docids.snap | 5 -- .../word_pair_proximity_docids.snap | 4 -- .../always_soft/documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../always_soft/word_docids.snap | 7 -- .../word_pair_proximity_docids.snap | 4 -- .../always_hard/facet_id_exists_docids.snap | 6 -- .../always_hard/facet_id_f64_docids.snap | 5 -- .../always_hard/facet_id_string_docids.snap | 17 ----- 
.../soft_deleted_documents_ids.snap | 4 -- .../always_hard/word_docids.snap | 38 ----------- .../word_pair_proximity_docids.snap | 25 ------- .../always_soft/facet_id_exists_docids.snap | 6 -- .../always_soft/facet_id_f64_docids.snap | 6 -- .../always_soft/facet_id_string_docids.snap | 19 ------ .../soft_deleted_documents_ids.snap | 4 -- .../always_soft/word_docids.snap | 42 ------------ .../word_pair_proximity_docids.snap | 29 --------- .../always_hard/facet_id_f64_docids.snap | 31 --------- .../always_hard/facet_id_string_docids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../always_soft/facet_id_f64_docids.snap | 53 --------------- .../always_soft/facet_id_string_docids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- .../soft_deleted_documents_ids.snap | 4 -- 117 files changed, 1599 deletions(-) delete mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap delete mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap delete mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap delete mode 100644 
milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap delete mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap delete mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap delete mode 
100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap delete mode 100644 
milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap delete mode 100644 
milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap delete mode 100644 
milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap deleted file mode 100644 index 92fc61d72..000000000 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap +++ /dev/null @@ -1,24 +0,0 @@ ---- -source: dump/src/reader/mod.rs -expression: spells.settings().unwrap() 
---- -{ - "displayedAttributes": [ - "*" - ], - "searchableAttributes": [ - "*" - ], - "filterableAttributes": [], - "sortableAttributes": [], - "rankingRules": [ - "typo", - "words", - "proximity", - "attribute", - "exactness" - ], - "stopWords": [], - "synonyms": {}, - "distinctAttribute": null -} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap deleted file mode 100644 index b0b54c136..000000000 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap +++ /dev/null @@ -1,38 +0,0 @@ ---- -source: dump/src/reader/mod.rs -expression: products.settings().unwrap() ---- -{ - "displayedAttributes": [ - "*" - ], - "searchableAttributes": [ - "*" - ], - "filterableAttributes": [], - "sortableAttributes": [], - "rankingRules": [ - "typo", - "words", - "proximity", - "attribute", - "exactness" - ], - "stopWords": [], - "synonyms": { - "android": [ - "phone", - "smartphone" - ], - "iphone": [ - "phone", - "smartphone" - ], - "phone": [ - "android", - "iphone", - "smartphone" - ] - }, - "distinctAttribute": null -} diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap deleted file mode 100644 index 5c12a0438..000000000 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap +++ /dev/null @@ -1,31 +0,0 @@ ---- -source: dump/src/reader/mod.rs -expression: movies.settings().unwrap() ---- -{ - "displayedAttributes": [ - "*" - ], - "searchableAttributes": [ - "*" - ], - "filterableAttributes": [ - "genres", - "id" - ], - "sortableAttributes": [ - "genres", - "id" - ], - "rankingRules": [ - "typo", - "words", - "proximity", - "attribute", - "exactness", - "release_date:asc" - ], - "stopWords": [], - "synonyms": {}, - "distinctAttribute": null -} diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap 
b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap deleted file mode 100644 index bef20823c..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap deleted file mode 100644 index 74c40e6a3..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap deleted file mode 100644 index 6fb086d35..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap deleted file mode 100644 index 0271a6c6b..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap deleted file mode 100644 index d801ef19f..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- 
-source: milli/src/update/facet/bulk.rs ---- -ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap deleted file mode 100644 index e9988f527..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap deleted file mode 100644 index aa52901da..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap deleted file mode 100644 index 64f5012a4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap deleted file mode 100644 index aa52901da..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- 
-16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap deleted file mode 100644 index bb0e9aa69..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap deleted file mode 100644 index b7705b72e..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -353d70f52eea66e5031dca989ea8a037 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap deleted file mode 100644 index 15030a1ea..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -52a093c909133d84023a4a7b83864808 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap deleted file mode 100644 index 949ec6647..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9d86c72ddb241d0aeca2995d61a3648a diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap 
b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap deleted file mode 100644 index d8797f1ab..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -c0943177594534bfe5527cbf40fe388e diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap deleted file mode 100644 index f7949c5f3..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -6ed86f234028ae3df5881bee5512f11e diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap deleted file mode 100644 index ed120bf02..000000000 --- a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/index_documents/mod.rs ---- -1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [21, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 6609786a3..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,20 +0,0 @@ ---- -source: 
milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [101, ] -1 a amazing [100, ] -1 a an [100, ] -1 a and [100, ] -1 a beautiful [100, ] -1 b house [100, ] -1 b rings [101, ] -1 be house [100, ] -1 be rings [101, ] -2 a am [101, ] -2 a amazing [100, ] -2 a and [100, ] -2 a beautiful [100, ] -2 a house [100, ] -2 b at [101, ] -2 be at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 52b29e136..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,23 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [101, ] -1 amazing a [100, ] -1 an a [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 at a [100, 101, ] -2 bell a [101, ] -3 an b [100, ] -3 an be [100, ] -3 at a [100, ] -3 rings a [101, ] -3 the a [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 7644c433d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,29 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [101, ] -1 a amazing [100, ] -1 a an [100, 202, ] -1 a and [100, ] -1 a beautiful [100, ] -1 a extraordinary [202, ] -1 am and [100, ] -1 an amazing [100, ] -1 an beautiful [100, ] -1 an extraordinary [202, ] -1 b house [100, ] -1 b rings [101, ] -1 be house [100, ] -1 be 
rings [101, ] -2 a am [101, ] -2 a amazing [100, ] -2 a and [100, ] -2 a beautiful [100, ] -2 a extraordinary [202, ] -2 a house [100, 202, ] -2 am beautiful [100, ] -2 an and [100, ] -2 an house [100, 202, ] -2 b at [101, ] -2 be at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap deleted file mode 100644 index 1b56974c2..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap +++ /dev/null @@ -1,33 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 am [101, ] -1 amazing and [100, ] -1 an amazing [100, ] -1 an extraordinary [202, ] -1 and beautiful [100, ] -1 at 5 [101, ] -1 at an [100, 202, ] -1 beautiful house [100, ] -1 bell rings [101, ] -1 extraordinary house [202, ] -1 rings at [101, ] -1 the bell [101, ] -2 amazing beautiful [100, ] -2 an and [100, ] -2 an house [202, ] -2 and house [100, ] -2 at am [101, ] -2 at amazing [100, ] -2 at extraordinary [202, ] -2 bell at [101, ] -2 rings 5 [101, ] -2 the rings [101, ] -3 amazing house [100, ] -3 an beautiful [100, ] -3 at and [100, ] -3 at house [202, ] -3 bell 5 [101, ] -3 rings am [101, ] -3 the at [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 008a4b21d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,31 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [101, ] -1 5 am [101, ] -1 amazing a [100, ] -1 amazing an [100, ] -1 an a [100, ] -1 an am [100, ] -1 and b 
[100, ] -1 and be [100, ] -1 at a [100, 202, ] -1 at an [100, 202, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 an an [100, ] -2 at a [100, 101, ] -2 at am [100, 101, ] -2 bell a [101, ] -3 an b [100, ] -3 an be [100, ] -3 at a [100, ] -3 at an [100, ] -3 rings a [101, ] -3 rings am [101, ] -3 the a [101, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index d212999bb..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap deleted file mode 100644 index 816895dcf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap +++ /dev/null @@ -1,8 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a y [51, ] -1 x a [51, ] -1 x y [50, ] -2 x y [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 03530a2f1..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a y [51, ] -1 x y [50, ] -2 x y [51, ] - diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 61987fd4a..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -2 a am [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap deleted file mode 100644 index 1caf1a9a3..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap +++ /dev/null @@ -1,60 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, 
] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -at [51, ] -bell [51, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 618a0b076..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 rings a [51, ] -2 at a [51, ] -2 bell a [51, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap deleted file 
mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 267a1c01d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 b rings [51, ] -2 b at [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap deleted file mode 100644 index e5336d58c..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap +++ /dev/null @@ -1,60 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -am [51, ] -at [51, ] -b0 [0, ] -b1 [1, ] -b10 [16, ] -b11 [17, ] -b12 [18, ] -b13 [19, ] -b14 [20, ] -b15 [21, ] -b16 [22, ] -b17 [23, ] -b18 [24, ] -b19 [25, ] -b1a [26, ] -b1b [27, ] -b1c [28, ] -b1d [29, ] -b1e [30, ] -b1f [31, ] -b2 [2, ] -b20 [32, ] -b21 [33, ] -b22 [34, ] -b23 [35, ] -b24 [36, ] -b25 [37, ] -b26 [38, ] -b27 [39, ] -b28 [40, ] -b29 [41, ] -b2a [42, ] -b2b [43, ] -b2c [44, ] -b2d [45, ] -b2e [46, ] -b2f [47, ] -b3 [3, ] -b30 [48, ] -b31 [49, ] -b4 [4, ] -b5 [5, ] -b6 [6, ] -b7 [7, ] -b8 [8, ] -b9 [9, ] -ba [10, ] -bb [11, ] -bc [12, ] -bd [13, ] -be [14, ] -bell [51, ] -bf [15, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 4cdf756ac..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 the b [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap deleted file mode 100644 index 4dca775e6..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, ] diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 61987fd4a..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -2 a am [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap deleted file mode 100644 index 7949d464e..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -am [51, ] -at [51, ] -bell [51, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 618a0b076..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 rings a [51, ] -2 at a [51, ] -2 bell a [51, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 78b6a3885..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap deleted file mode 100644 index 8c7809973..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b 
[43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -amazing [50, ] -an [50, ] -at [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 65d8b806b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap deleted file mode 100644 index 775d41a3d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 54c9e4b9b..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 b rings [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap deleted file mode 100644 index f86fdcb8b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -amazing [50, ] -an [50, ] -at [50, ] -b0 [52, ] -b1 [53, ] -b10 [68, ] -b11 [69, ] -b12 [70, ] -b13 [71, ] -b14 [72, ] -b15 [73, ] -b16 [74, ] -b17 [75, ] -b18 [76, ] -b19 [77, ] -b1a [78, ] -b1b [79, ] -b1c [80, ] -b1d [81, ] -b1e [82, ] -b1f [83, ] -b2 [54, ] -b20 [84, ] -b21 [85, ] -b22 [86, ] -b23 [87, ] -b24 [88, ] -b25 [89, ] -b26 [90, ] -b27 [91, ] -b28 [92, ] -b29 [93, ] -b2a [94, ] -b2b [95, ] -b2c [96, ] -b2d [97, ] -b2e [98, ] -b2f [99, ] -b3 [55, ] -b30 [100, ] -b31 [101, ] -b4 [56, ] -b5 [57, ] -b6 [58, ] -b7 [59, ] -b8 [60, ] -b9 [61, ] -ba [62, ] -bb [63, ] -bc [64, ] -bd [65, ] -be [66, ] -bell [51, ] -bf [67, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 4cdf756ac..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 the b [51, ] - diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 78b6a3885..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap deleted file mode 100644 index 8c7809973..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap +++ /dev/null @@ -1,61 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] 
-a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -amazing [50, ] -an [50, ] -at [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 65d8b806b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap deleted file mode 100644 index 775d41a3d..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index 0241f26a5..000000000 --- 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,10 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a amazing [50, ] -1 a an [50, ] -1 a house [50, ] -1 b rings [51, ] -2 a amazing [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap deleted file mode 100644 index 6a481eeee..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5f6443e54fae188aa96d4f27fce28939 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index d20582970..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,8 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 an a [50, ] -1 at a [50, ] -1 the b [51, ] -2 at a [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap deleted file mode 100644 index 39e9fbe65..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful 
[50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap deleted file mode 100644 index 78008f83b..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: 
milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a 
[50, ] -1 at a [50, ] -1 rings a [51, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap deleted file mode 100644 index c8a1e54b4..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index db62b6566..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,17 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing [50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -1 b house [50, ] -1 b rings [51, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] -2 b at [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap deleted file mode 100644 index 7fd726325..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap 
+++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -9f4866b80177e321a33ce434992022b5 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 2ea0d46f4..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,19 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 and b [50, ] -1 at a [50, ] -1 rings a [51, ] -1 the b [51, ] -2 amazing b [50, ] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 an b [50, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap deleted file mode 100644 index 4dca775e6..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap deleted file mode 100644 index b380ba9b5..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap +++ /dev/null @@ -1,14 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 a 5 [51, ] -1 a amazing 
[50, ] -1 a an [50, ] -1 a and [50, ] -1 a beautiful [50, ] -2 a am [51, ] -2 a amazing [50, ] -2 a and [50, ] -2 a beautiful [50, ] -2 a house [50, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap deleted file mode 100644 index 6b5658b74..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap +++ /dev/null @@ -1,65 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -5 [51, ] -a0 [0, ] -a1 [1, ] -a10 [16, ] -a11 [17, ] -a12 [18, ] -a13 [19, ] -a14 [20, ] -a15 [21, ] -a16 [22, ] -a17 [23, ] -a18 [24, ] -a19 [25, ] -a1a [26, ] -a1b [27, ] -a1c [28, ] -a1d [29, ] -a1e [30, ] -a1f [31, ] -a2 [2, ] -a20 [32, ] -a21 [33, ] -a22 [34, ] -a23 [35, ] -a24 [36, ] -a25 [37, ] -a26 [38, ] -a27 [39, ] -a28 [40, ] -a29 [41, ] -a2a [42, ] -a2b [43, ] -a2c [44, ] -a2d [45, ] -a2e [46, ] -a2f [47, ] -a3 [3, ] -a30 [48, ] -a31 [49, ] -a4 [4, ] -a5 [5, ] -a6 [6, ] -a7 [7, ] -a8 [8, ] -a9 [9, ] -aa [10, ] -ab [11, ] -ac [12, ] -ad [13, ] -ae [14, ] -af [15, ] -am [51, ] -amazing [50, ] -an [50, ] -and [50, ] -at [50, 51, ] -beautiful [50, ] -bell [51, ] -house [50, ] -rings [51, ] -the [51, ] - diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 885985bdf..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,15 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -1 5 a [51, ] -1 amazing a [50, ] -1 an a [50, ] -1 at a [50, ] -1 rings a [51, 
] -2 an a [50, ] -2 at a [50, 51, ] -2 bell a [51, ] -3 at a [50, ] -3 rings a [51, ] -3 the a [51, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- 
-source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap deleted file mode 100644 
index 6d69b2ffb..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap deleted file mode 100644 index 88d3a98aa..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -benoit [2, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap deleted file mode 100644 index 6d69b2ffb..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index 9139b7a05..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[0, 1, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap deleted file mode 100644 index 15c881e87..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -benoit [2, ] -kevin [0, ] -kevina [1, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap deleted file mode 100644 index 7481b11c4..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap deleted file mode 100644 index 87856f6dc..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap +++ /dev/null @@ -1,5 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -2 0 2.2 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap deleted file mode 100644 index ab1d2175f..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap +++ /dev/null @@ -1,17 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] -1 0 aquarium 1 [5, ] -1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] -1 0 cartoon 1 [2, 7, 15, 17, ] -1 0 colorfulness 1 [13, ] -1 0 design 1 [2, 18, ] -1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] -1 0 geometry 1 [19, ] -1 0 letter 1 [1, ] -1 0 outdoor 1 [4, ] -1 0 painting 1 [3, ] -1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] -2 0 design 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap deleted file mode 100644 index f8d64e001..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap +++ /dev/null @@ -1,38 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] -2 [21, ] -36 [3, ] -37 [4, ] -38 [5, ] -39 [6, ] -40 [7, ] -41 [8, ] -42 [9, ] -43 [10, 
] -44 [11, ] -45 [12, ] -46 [13, ] -47 [14, ] -5 [1, ] -52 [15, ] -57 [16, ] -58 [17, ] -68 [18, ] -69 [19, ] -7 [2, ] -71 [21, ] -abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -aquarium [5, ] -art [4, 5, 8, 9, 10, 12, 17, ] -cartoon [2, 7, 15, 17, ] -colorfulness [13, ] -design [2, 18, 21, ] -drawing [3, 4, 5, 8, 10, 11, 16, ] -geometry [19, ] -letter [1, ] -outdoor [4, ] -painting [3, ] -pattern [2, 3, 9, 10, 13, 14, 16, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap deleted file mode 100644 index 36add107b..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap +++ /dev/null @@ -1,25 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 1 36 [3, ] -1 1 37 [4, ] -1 1 38 [5, ] -1 1 39 [6, ] -1 1 40 [7, ] -1 1 41 [8, ] -1 1 42 [9, ] -1 1 43 [10, ] -1 1 44 [11, ] -1 1 45 [12, ] -1 1 46 [13, ] -1 1 47 [14, ] -1 1 5 [1, ] -1 1 52 [15, ] -1 1 57 [16, ] -1 1 58 [17, ] -1 1 68 [18, ] -1 1 69 [19, ] -1 1 7 [2, ] -1 1 71 [21, ] -1 2 2 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap deleted file mode 100644 index a7ee4348d..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [20, 21, 22, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap deleted file mode 100644 index cfa649653..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -2 0 1.2 1 [20, 22, ] -2 0 2.2 1 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap deleted file mode 100644 index 8336bd712..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap +++ /dev/null @@ -1,19 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] -1 0 aquarium 1 [5, ] -1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] -1 0 cartoon 1 [2, 7, 15, 17, ] -1 0 colorfulness 1 [13, ] -1 0 design 1 [2, 18, ] -1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] -1 0 geometry 1 [19, ] -1 0 letter 1 [1, ] -1 0 outdoor 1 [4, ] -1 0 painting 1 [3, ] -1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] -1 0 sign 1 [0, ] -2 0 design 1 [21, ] -2 0 geometry 1 [20, 22, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index dfac98e59..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap deleted file mode 100644 index 972a733e2..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap +++ /dev/null @@ -1,42 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ] -2 [20, 21, 22, ] -36 [3, ] -37 [4, ] -38 [5, ] -39 [6, ] -4 [0, ] -40 [7, ] -41 [8, ] -42 [9, ] -43 [10, ] -44 [11, ] -45 [12, ] -46 [13, ] -47 [14, ] -5 [1, ] -52 [15, ] -57 [16, ] -58 [17, ] -68 [18, ] -69 [19, ] -7 [2, ] -70 [20, ] -71 [21, ] -72 [22, ] -abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -aquarium [5, ] -art [4, 5, 8, 9, 10, 12, 17, ] -cartoon [2, 7, 15, 17, ] -colorfulness [13, ] -design [2, 18, 21, ] -drawing [3, 4, 5, 8, 10, 11, 16, ] -geometry [19, 20, 22, ] -letter [1, ] -outdoor [4, ] -painting [3, ] -pattern [2, 3, 9, 10, 13, 14, 16, ] -sign [0, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap deleted file mode 100644 index 941838e34..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap +++ /dev/null @@ -1,29 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -1 1 2 [20, 22, ] -1 1 36 [3, ] -1 1 37 [4, ] -1 1 38 [5, ] -1 1 39 [6, ] -1 1 4 [0, ] -1 1 40 [7, ] -1 1 41 [8, ] -1 1 42 [9, ] -1 1 43 [10, ] -1 1 44 [11, ] -1 1 45 [12, ] -1 1 46 [13, ] -1 1 47 [14, ] -1 1 5 [1, ] -1 1 52 [15, ] -1 1 57 [16, ] -1 1 58 [17, ] -1 1 68 [18, ] -1 1 69 [19, ] -1 1 7 [2, ] -1 1 70 [20, ] -1 1 71 [21, ] -1 1 72 [22, ] -1 2 2 [21, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap deleted file mode 100644 index 18a9d9309..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap +++ /dev/null @@ -1,31 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -3 0 48.9021 1 [19, ] -3 0 49.9314 1 [17, ] -3 0 50.1793 1 [15, ] -3 0 50.2844 1 [14, ] -3 0 50.3518 1 [13, ] -3 0 50.4502 1 [12, ] -3 0 50.6053 1 [8, ] -3 0 50.6224 1 [3, ] -3 0 50.6299 1 [0, ] -3 0 50.6312 1 [2, ] -3 0 50.6415 1 [1, ] -3 0 50.7453 1 [7, ] -3 0 50.8466 1 [10, ] -3 0 51.0537 1 [9, ] -4 0 2.271 1 [17, ] -4 0 2.3708 1 [19, ] -4 0 2.7637 1 [14, ] -4 0 3.0569 1 [0, ] -4 0 3.1106 1 [1, 2, ] -4 0 3.1476 1 [3, ] -4 0 3.2189 1 [15, ] -4 0 3.2206 1 [7, ] -4 0 3.3758 1 [8, ] -4 0 3.5326 1 [13, ] -4 0 3.6957 1 [9, ] -4 0 3.9623 1 [12, ] -4 0 4.337 
1 [10, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap deleted file mode 100644 index c909a3cd8..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap +++ /dev/null @@ -1,53 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -3 0 48.9021 1 [19, ] -3 0 49.4449 1 [18, ] -3 0 49.9314 1 [17, ] -3 0 50.1112 1 [16, ] -3 0 50.1793 1 
[15, ] -3 0 50.2844 1 [14, ] -3 0 50.3518 1 [13, ] -3 0 50.4095 1 [11, ] -3 0 50.4502 1 [12, ] -3 0 50.6053 1 [8, ] -3 0 50.6224 1 [3, ] -3 0 50.6299 1 [0, ] -3 0 50.6312 1 [2, ] -3 0 50.6415 1 [1, ] -3 0 50.6552 1 [4, ] -3 0 50.6924 1 [5, ] -3 0 50.7263 1 [6, ] -3 0 50.7453 1 [7, ] -3 0 50.8466 1 [10, ] -3 0 51.0537 1 [9, ] -3 1 48.9021 4 [16, 17, 18, 19, ] -3 1 50.1793 4 [11, 13, 14, 15, ] -3 1 50.4502 4 [0, 3, 8, 12, ] -3 1 50.6312 4 [1, 2, 4, 5, ] -3 1 50.7263 4 [6, 7, 9, 10, ] -4 0 2.271 1 [17, ] -4 0 2.3708 1 [19, ] -4 0 2.7637 1 [14, ] -4 0 2.7913 1 [18, ] -4 0 2.8547 1 [16, ] -4 0 3.0569 1 [0, ] -4 0 3.1106 1 [1, 2, ] -4 0 3.1476 1 [3, ] -4 0 3.1541 1 [6, ] -4 0 3.1763 1 [5, ] -4 0 3.1897 1 [4, ] -4 0 3.2189 1 [15, ] -4 0 3.2206 1 [7, ] -4 0 3.3758 1 [8, ] -4 0 3.5326 1 [13, ] -4 0 3.6957 1 [9, ] -4 0 3.9623 1 [12, ] -4 0 4.337 1 [10, ] -4 0 4.4347 1 [11, ] -4 1 2.271 4 [14, 17, 18, 19, ] -4 1 2.8547 4 [0, 1, 2, 3, 16, ] -4 1 3.1541 4 [4, 5, 6, 15, ] -4 1 3.2206 4 [7, 8, 9, 13, ] -4 1 3.9623 3 [10, 11, 12, ] - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap 
deleted file mode 100644 index 1260b12de..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ 
-1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap deleted file mode 100644 index e87bce206..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap deleted file mode 100644 index efcd7af8c..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- -[2, 15, ] From b40253bf18bbb1a2981124936028a5e9e4208d78 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 10:15:34 +0100 Subject: [PATCH 080/127] update snapshots --- .../facet_id_exists_docids.snap | 6 +++ 
.../facet_id_f64_docids.snap | 5 +++ .../facet_id_string_docids.snap | 17 +++++++++ .../word_docids.snap | 38 +++++++++++++++++++ .../word_pair_proximity_docids.snap | 25 ++++++++++++ 5 files changed, 91 insertions(+) create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap new file mode 100644 index 000000000..ed120bf02 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [21, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap new file mode 100644 
index 000000000..deeddff0d --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +2 0 2.2 1 [21, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap new file mode 100644 index 000000000..2d0b98623 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap @@ -0,0 +1,17 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +2 0 design 1 [21, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap new file mode 100644 index 000000000..73503f098 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap @@ -0,0 +1,38 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] +2 [21, ] +36 [3, ] +37 [4, ] +38 [5, ] +39 [6, ] +40 [7, ] +41 [8, ] +42 [9, ] +43 [10, ] +44 [11, ] +45 [12, ] +46 
[13, ] +47 [14, ] +5 [1, ] +52 [15, ] +57 [16, ] +58 [17, ] +68 [18, ] +69 [19, ] +7 [2, ] +71 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap new file mode 100644 index 000000000..022e9f5b1 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap @@ -0,0 +1,25 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 1 36 [3, ] +1 1 37 [4, ] +1 1 38 [5, ] +1 1 39 [6, ] +1 1 40 [7, ] +1 1 41 [8, ] +1 1 42 [9, ] +1 1 43 [10, ] +1 1 44 [11, ] +1 1 45 [12, ] +1 1 46 [13, ] +1 1 47 [14, ] +1 1 5 [1, ] +1 1 52 [15, ] +1 1 57 [16, ] +1 1 58 [17, ] +1 1 68 [18, ] +1 1 69 [19, ] +1 1 7 [2, ] +1 1 71 [21, ] +1 2 2 [21, ] + From 94206b00552523c104bebf720716bf53a3c6e12a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 31 Oct 2023 13:48:47 +0100 Subject: [PATCH 081/127] Update tests --- milli/src/search/new/tests/proximity.rs | 10 +++++----- ...__new__tests__proximity__proximity_prefix_db-8.snap | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 4d340ae1c..217ebe9b3 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -423,20 +423,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); 
s.query("best win"); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ + "\"this is the best winter meal\"", + "\"winter best\"", + "\"this is the best meal of winter\"", + "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", - "\"this is the best meal of winter\"", - "\"this is the best winter meal\"", "\"winter x y best\"", - "\"winter x best\"", - "\"winter best\"", ] "###); diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap index 5129f1b3b..8f3b964c1 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap @@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 4, max_rank: 4, }, ), @@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 3, max_rank: 4, }, ), @@ -22,7 +22,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 2, max_rank: 4, }, ), @@ -30,7 +30,7 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 1, + rank: 2, max_rank: 4, }, ), From da0503ef80f57cff27eb521aa3089f4146eff2c3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 
2023 10:57:08 +0100 Subject: [PATCH 082/127] Fix document count --- milli/src/update/index_documents/mod.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ad2f63beb..0174fe319 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -371,12 +371,11 @@ where let _ = lmdb_writer_sx.send(Err(e)); } - // needs to be droped to avoid channel waiting lock. + // needs to be dropped to avoid channel waiting lock. drop(lmdb_writer_sx) }); - let index_documents_ids = self.index.documents_ids(self.wtxn)?; - let index_is_empty = index_documents_ids.is_empty(); + let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0; let mut final_documents_ids = RoaringBitmap::new(); let mut databases_seen = 0; @@ -422,16 +421,6 @@ where // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; - // We write the external documents ids into the main database. 
- //let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - //external_documents_ids.insert_ids(&new_external_documents_ids)?; - //let external_documents_ids = external_documents_ids.into_static(); - //self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - - // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids` - let all_documents_ids = index_documents_ids | new_documents_ids; - //self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - // TODO: reactivate prefix DB with diff-indexing // self.execute_prefix_databases( // word_docids, @@ -441,7 +430,7 @@ where // word_fid_docids, // )?; - Ok(all_documents_ids.len()) + self.index.number_of_documents(self.wtxn) } #[logging_timer::time("IndexDocuments::{}")] From c855cc27215ca6a88ae41e010872b9ab337be4c1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 11:54:15 +0100 Subject: [PATCH 083/127] Remove unused test --- milli/src/index.rs | 258 --------------------------------------------- 1 file changed, 258 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f7450a672..a52033fb6 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1955,264 +1955,6 @@ pub(crate) mod tests { "###); } - #[test] - fn replace_documents_in_batches_external_ids_and_soft_deletion_check() { - use big_s::S; - use maplit::hashset; - - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! 
{ S("doggo") }); - }) - .unwrap(); - - let add_documents = |index: &TempIndex, docs: Vec>| { - let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new( - &mut wtxn, - index, - &index.indexer_config, - index.index_documents_config.clone(), - |_| (), - || false, - ) - .unwrap(); - for docs in docs { - (builder, _) = builder.add_documents(documents!(docs)).unwrap(); - } - builder.execute().unwrap(); - wtxn.commit().unwrap(); - }; - // First Batch - { - let mut docs1 = vec![]; - for i in 0..4 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1]); - - db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - docids: - 0 0 - 1 1 - 2 2 - 3 3 - "###); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, ] - 1 0 2 1 [2, ] - 1 0 3 1 [3, ] - "###); - } - // Second Batch: replace the documents with soft-deletion - { - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+1 } - )); - } - let mut docs2 = vec![]; - for i in 0..3 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - docids: - 0 0 - 1 1 - 2 2 - 3 3 - "###); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, ] - 1 0 1 1 [1, ] - 1 0 2 1 [2, ] - 1 0 3 1 [3, ] - "###); - } - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0]; - - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - 
insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(1), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(2), - } - "###); - drop(rtxn); - // Third Batch: replace the documents with soft-deletion again - { - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+1 } - )); - } - let mut docs2 = vec![]; - for i in 0..4 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 7 - 1 8 - 2 9 - 3 3 - "###); - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [0, 4, 7, ] - 1 0 1 1 [1, 5, 8, ] - 1 0 2 1 [2, 6, 9, ] - 1 0 3 1 [3, ] - "###); - } - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - 
insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(1), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(2), - "doggo": Number(2), - } - "###); - drop(rtxn); - - // Fourth Batch: replace the documents without soft-deletion - { - let mut docs1 = vec![]; - for i in 0..3 { - docs1.push(serde_json::json!( - { "id": i, "doggo": i+2 } - )); - } - let mut docs2 = vec![]; - for i in 0..1 { - docs2.push(serde_json::json!( - { "id": i, "doggo": i } - )); - } - add_documents(&index, vec![docs1, docs2]); - - db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]"); - db_snap!(index, external_documents_ids, 1, @r###" - soft: - hard: - 0 10 - 1 11 - 2 12 - 3 3 - "###); - - db_snap!(index, facet_id_f64_docids, 1, @r###" - 1 0 0 1 [10, ] - 1 0 3 1 [3, 11, ] - 1 0 4 1 [12, ] - "###); - - let rtxn = index.read_txn().unwrap(); - let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(3), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(0), - "doggo": Number(0), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, @r###" - { - "id": Number(1), - "doggo": Number(3), - } - "###); - let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0]; - let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); - insta::assert_debug_snapshot!(json, 
@r###" - { - "id": Number(2), - "doggo": Number(4), - } - "###); - drop(rtxn); - } - } - #[test] fn bug_3021_first() { // https://github.com/meilisearch/meilisearch/issues/3021 From 03ddb4f3106466f9e4056835c0285604097927af Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 11:54:22 +0100 Subject: [PATCH 084/127] use deladd in facet update tests --- milli/src/update/facet/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 05e6a93d8..0839acf08 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -278,6 +278,7 @@ pub(crate) mod test_helpers { use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; + use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; @@ -454,8 +455,10 @@ pub(crate) mod test_helpers { let key: FacetGroupKey<&[u8]> = FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let mut inner_writer = KvWriterDelAdd::memory(); let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap(); - writer.insert(&key, &value).unwrap(); + inner_writer.insert(DelAdd::Addition, value).unwrap(); + writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap(); } writer.finish().unwrap(); let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); From f19332466eea45212bbefaa37e4736b5917511c4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 16:35:38 +0100 Subject: [PATCH 085/127] Extract field value as values instead of Option --- .../index_documents/extract/extract_fid_docid_facet_values.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs 
b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 87320a675..2dce90cfc 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -102,11 +102,11 @@ pub fn extract_fid_docid_facet_values( let del_add_obkv = obkv::KvReader::new(field_bytes); let del_value = match del_add_obkv.get(DelAdd::Deletion) { - Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?, + Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), None => None, }; let add_value = match del_add_obkv.get(DelAdd::Addition) { - Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?, + Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), None => None, }; From b1d1355b6983f099f770f6b1a453915fee88dcaf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 16:36:18 +0100 Subject: [PATCH 086/127] remove tests on soft-deleted --- milli/src/update/facet/mod.rs | 85 -------------- milli/src/update/index_documents/mod.rs | 7 +- milli/src/update/prefix_word_pairs/mod.rs | 133 ---------------------- 3 files changed, 3 insertions(+), 222 deletions(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 0839acf08..7358ceb6c 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -558,91 +558,6 @@ pub(crate) mod test_helpers { } } -#[cfg(test)] -mod tests { - use big_s::S; - use maplit::hashset; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; - - #[test] - fn replace_all_identical_soft_deletion_then_hard_deletion() { - let index = TempIndex::new_with_map_size(4096 * 1000 * 100); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! 
{ S("size") }); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); - - let mut documents = vec![]; - for i in 0..999 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - "other": 0, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); - - // Then replace the last document while disabling soft_deletion - let mut documents = vec![]; - for i in 999..1000 { - documents.push( - serde_json::json! { - { - "id": i, - "size": i % 250, - "other": 0, - } - } - .as_object() - .unwrap() - .clone(), - ); - } - - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); - } -} - #[allow(unused)] #[cfg(test)] mod comparison_bench { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 0174fe319..c32f907b2 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -763,11 +763,10 @@ mod tests { assert_eq!(count, 1); // Check that we get only one document from the database. 
- // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 - let docs = index.documents(&rtxn, Some(1)).unwrap(); + let docs = index.documents(&rtxn, Some(0)).unwrap(); assert_eq!(docs.len(), 1); let (id, doc) = docs[0]; - assert_eq!(id, 1); + assert_eq!(id, 0); // Check that this document is equal to the last one sent. let mut doc_iter = doc.iter(); @@ -828,7 +827,7 @@ mod tests { assert_eq!(count, 3); // the document 0 has been deleted and reinserted with the id 3 - let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); + let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap(); let kevin_position = docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); assert_eq!(kevin_position, 2); diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 1ec57e080..e718f9b77 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -357,139 +357,6 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); } - #[test] - fn soft_delete_and_reupdate() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) 
- .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - index.delete_document("9000"); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - index.delete_documents((0..50).map(|id| id.to_string()).collect()); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[test] - fn replace_soft_deletion() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let 
batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - } - #[test] fn replace_hard_deletion() { let mut index = TempIndex::new(); From 0fb6acefc3503c220796684a1e64fd550358d0eb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 31 Oct 2023 17:11:08 +0100 Subject: [PATCH 087/127] Add snapshots for facets --- .../update/facet/snapshots/bulk.rs/insert/default.hash.snap | 4 ++++ .../bulk.rs/insert/large_group_small_min_level.hash.snap | 4 ++++ .../bulk.rs/insert/odd_group_odd_min_level.hash.snap | 4 ++++ .../bulk.rs/insert/small_group_large_min_level.hash.snap | 4 ++++ 
.../bulk.rs/insert/small_group_small_min_level.hash.snap | 4 ++++ .../bulk.rs/insert_delete_field_insert/default.hash.snap | 4 ++++ .../large_group_small_min_level.hash.snap | 4 ++++ .../odd_group_odd_min_level.hash.snap | 4 ++++ .../small_group_large_min_level.hash.snap | 4 ++++ .../small_group_small_min_level.hash.snap | 4 ++++ .../facet/snapshots/bulk.rs/insert_string/default.hash.snap | 4 ++++ .../insert_string/large_group_small_min_level.hash.snap | 4 ++++ .../bulk.rs/insert_string/odd_group_odd_min_level.hash.snap | 4 ++++ .../insert_string/small_group_large_min_level.hash.snap | 4 ++++ .../insert_string/small_group_small_min_level.hash.snap | 4 ++++ 15 files changed, 60 insertions(+) create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap create mode 100644 
milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap new file mode 100644 index 000000000..bef20823c --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..74c40e6a3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..6fb086d35 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..0271a6c6b --- /dev/null +++ 
b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..d801ef19f --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap new file mode 100644 index 000000000..e9988f527 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..64f5012a4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/update/facet/bulk.rs +--- +be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..bb0e9aa69 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap new file mode 100644 index 000000000..b7705b72e --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +353d70f52eea66e5031dca989ea8a037 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..15030a1ea --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +52a093c909133d84023a4a7b83864808 diff --git 
a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..949ec6647 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9d86c72ddb241d0aeca2995d61a3648a diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..d8797f1ab --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +c0943177594534bfe5527cbf40fe388e diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..f7949c5f3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +6ed86f234028ae3df5881bee5512f11e From 0fc446c62f07ce4e5802a2affc39abdcd6a0ef1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 1 Nov 2023 10:07:03 +0100 Subject: [PATCH 088/127] Add more timing logs to the Transform --- milli/src/update/index_documents/transform.rs | 130 ++++++------------ 1 file changed, 44 insertions(+), 86 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 840bade2e..23b5c78c1 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -150,6 +150,7 @@ impl<'a, 'i> Transform<'a, 'i> { 
}) } + #[logging_timer::time] pub fn read_documents( &mut self, reader: EnrichedDocumentsBatchReader, @@ -162,6 +163,8 @@ impl<'a, 'i> Transform<'a, 'i> { FP: Fn(UpdateIndexingStep) + Sync, FA: Fn() -> bool + Sync, { + puffin::profile_function!(); + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); let external_documents_ids = self.index.external_documents_ids(); let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; @@ -212,13 +215,12 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); // Build the new obkv document. - let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = KvWriter::new(&mut obkv_buffer); for (k, v) in field_buffer_cache.iter() { writer.insert(*k, v)?; } let mut original_docid = None; - let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { HEntry::Occupied(entry) => *entry.get() as u32, HEntry::Vacant(entry) => { @@ -275,24 +277,19 @@ impl<'a, 'i> Transform<'a, 'i> { &mut document_sorter_buffer, )?; self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; - match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { - Some(flattened_obkv) => { - // we recreate our buffer with the flattened documents - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - into_del_add_obkv( - KvReaderU16::new(&flattened_obkv), - true, - keep_original_version, - &mut document_sorter_buffer, - )?; - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? - } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + let base_obkv = KvReader::new(base_obkv); + if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? 
{ + // we recreate our buffer with the flattened documents + document_sorter_buffer.clear(); + document_sorter_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&flattened_obkv), + true, + keep_original_version, + &mut document_sorter_buffer, + )?; } + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; } } @@ -310,23 +307,18 @@ impl<'a, 'i> Transform<'a, 'i> { // We use the extracted/generated user id as the key for this document. self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(flattened_obkv) => { - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); - into_del_add_obkv( - KvReaderU16::new(&flattened_obkv), - false, - true, - &mut document_sorter_buffer, - )?; - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? - } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + let flattened_obkv = KvReader::new(&obkv_buffer); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + document_sorter_buffer.clear(); + document_sorter_buffer.push(Operation::Addition as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + false, + true, + &mut document_sorter_buffer, + )? } + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; } documents_count += 1; @@ -361,6 +353,7 @@ impl<'a, 'i> Transform<'a, 'i> { /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, /// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. /// - If the document to remove was not present in either the db or the transform we do nothing. 
+ #[logging_timer::time] pub fn remove_documents( &mut self, mut to_remove: Vec, @@ -370,6 +363,8 @@ impl<'a, 'i> Transform<'a, 'i> { where FA: Fn() -> bool + Sync, { + puffin::profile_function!(); + // there may be duplicates in the documents to remove. to_remove.sort_unstable(); to_remove.dedup(); @@ -439,24 +434,19 @@ impl<'a, 'i> Transform<'a, 'i> { self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; // flatten it and push it as to delete in the flattened_sorter - match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { - Some(flattened_obkv) => { - // we recreate our buffer with the flattened documents - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Deletion as u8); - into_del_add_obkv( - KvReaderU16::new(&flattened_obkv), - true, - false, - &mut document_sorter_buffer, - )?; - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)? - } - None => self - .flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_buffer)?, + let flattened_obkv = KvReader::new(base_obkv); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? 
{ + // we recreate our buffer with the flattened documents + document_sorter_buffer.clear(); + document_sorter_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + true, + false, + &mut document_sorter_buffer, + )?; } + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; true } @@ -591,42 +581,10 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(()) } - fn remove_deleted_documents_from_field_distribution( - &self, - rtxn: &RoTxn, - field_distribution: &mut FieldDistribution, - ) -> Result<()> { - for deleted_docid in self.replaced_documents_ids.iter() { - let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - // We checked that the document was in the db earlier. If we can't find it it means - // there is an inconsistency between the field distribution and the field id map. - let field = - field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Accessing field distribution in transform.", - })?; - *field -= 1; - if *field == 0 { - // since we were able to get the field right before it's safe to unwrap here - field_distribution.remove(name).unwrap(); - } - } - } - Ok(()) - } - /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. 
+ #[logging_timer::time] pub(crate) fn output_from_sorter( self, wtxn: &mut heed::RwTxn, @@ -816,7 +774,7 @@ impl<'a, 'i> Transform<'a, 'i> { let (docid, obkv) = result?; obkv_buffer.clear(); - let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); + let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. for (id, name) in new_fields_ids_map.iter() { From c71b1d33ae5de96ae013e4695b13bc16263b4c3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 1 Nov 2023 10:39:16 +0100 Subject: [PATCH 089/127] Sort entries using rayon in the transform sorters --- Cargo.lock | 5 +- milli/Cargo.toml | 3 +- milli/src/update/index_documents/transform.rs | 51 +++++++++++++------ 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2ab2f706a..957dffbe4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1664,11 +1664,12 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93" +source = "git+https://github.com/meilisearch/grenad?branch=parallel-sorter#eafb6ae795af6078e087edf77e7cd31a26238707" dependencies = [ "bytemuck", "byteorder", + "crossbeam-channel", + "rayon", "tempfile", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 68bc2d2b5..da259c65d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -26,7 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.4.4", default-features = false, features = [ +grenad = { git = "https://github.com/meilisearch/grenad", branch = "parallel-sorter", default-features = false, features = [ + "rayon", "tempfile", ] } heed = { git = "https://github.com/meilisearch/heed", 
tag = "v0.12.7", default-features = false, features = [ diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 23b5c78c1..8d1750c49 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -114,24 +114,43 @@ impl<'a, 'i> Transform<'a, 'i> { }; // We initialize the sorter with the user indexing settings. - let original_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - merge_function, - indexer_settings.chunk_compression_type, - indexer_settings.chunk_compression_level, - indexer_settings.max_nb_chunks, - indexer_settings.max_memory.map(|mem| mem / 2), - ); + let original_sorter = { + let mut builder = grenad::Sorter::builder(merge_function); + builder.chunk_compression_type(indexer_settings.chunk_compression_type); + if let Some(level) = indexer_settings.chunk_compression_level { + builder.chunk_compression_level(level); + } + if let Some(nb_chunks) = indexer_settings.max_nb_chunks { + builder.max_nb_chunks(nb_chunks); + } + if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) { + builder.dump_threshold(memory); + builder.allow_realloc(false); + } + builder.sort_algorithm(grenad::SortAlgorithm::Stable); + builder.sort_in_parallel(true); + builder.build() + }; // We initialize the sorter with the user indexing settings. 
- let flattened_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - merge_function, - indexer_settings.chunk_compression_type, - indexer_settings.chunk_compression_level, - indexer_settings.max_nb_chunks, - indexer_settings.max_memory.map(|mem| mem / 2), - ); + let flattened_sorter = { + let mut builder = grenad::Sorter::builder(merge_function); + builder.chunk_compression_type(indexer_settings.chunk_compression_type); + if let Some(level) = indexer_settings.chunk_compression_level { + builder.chunk_compression_level(level); + } + if let Some(nb_chunks) = indexer_settings.max_nb_chunks { + builder.max_nb_chunks(nb_chunks); + } + if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) { + builder.dump_threshold(memory); + builder.allow_realloc(false); + } + builder.sort_algorithm(grenad::SortAlgorithm::Stable); + builder.sort_in_parallel(true); + builder.build() + }; + let documents_ids = index.documents_ids(wtxn)?; Ok(Transform { From e507ef593267795b4a88fde05477e58fb6948724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 1 Nov 2023 11:06:58 +0100 Subject: [PATCH 090/127] Slow the logging down --- index-scheduler/src/batch.rs | 8 ++++---- meilisearch/src/lib.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index c273d8ebb..ebdba0a8c 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -24,7 +24,7 @@ use std::fs::{self, File}; use std::io::BufWriter; use dump::IndexMetadata; -use log::{debug, error, info}; +use log::{debug, error, info, trace}; use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; @@ -1190,7 +1190,7 @@ impl IndexScheduler { index, indexer_config, config, - |indexing_step| debug!("update: {:?}", indexing_step), + |indexing_step| trace!("update: {:?}", indexing_step), || 
must_stop_processing.get(), )?; @@ -1268,7 +1268,7 @@ impl IndexScheduler { milli::update::Settings::new(index_wtxn, index, indexer_config); builder.reset_primary_key(); builder.execute( - |indexing_step| debug!("update: {:?}", indexing_step), + |indexing_step| trace!("update: {:?}", indexing_step), || must_stop_processing.clone().get(), )?; } @@ -1288,7 +1288,7 @@ impl IndexScheduler { index, indexer_config, config, - |indexing_step| debug!("update: {:?}", indexing_step), + |indexing_step| trace!("update: {:?}", indexing_step), || must_stop_processing.get(), )?; diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 603d8ff86..16c08c6c2 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -362,7 +362,7 @@ fn import_dump( update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }, - |indexing_step| log::debug!("update: {:?}", indexing_step), + |indexing_step| log::trace!("update: {:?}", indexing_step), || false, )?; From b10c060bf7e5c99d4789096d0cf15d4aa9e4fa24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 1 Nov 2023 13:55:18 +0100 Subject: [PATCH 091/127] Cleanup TOML --- Cargo.lock | 6 +++--- milli/Cargo.toml | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 957dffbe4..91fdc13be 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1663,12 +1663,12 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" -version = "0.4.4" -source = "git+https://github.com/meilisearch/grenad?branch=parallel-sorter#eafb6ae795af6078e087edf77e7cd31a26238707" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c" dependencies = [ "bytemuck", "byteorder", - "crossbeam-channel", "rayon", "tempfile", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index da259c65d..9cef4795b 100644 --- a/milli/Cargo.toml +++ 
b/milli/Cargo.toml @@ -26,9 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { git = "https://github.com/meilisearch/grenad", branch = "parallel-sorter", default-features = false, features = [ - "rayon", - "tempfile", +grenad = { version = "0.4.5", default-features = false, features = [ + "rayon", "tempfile" ] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [ "lmdb", "read-txn-no-tls" From 4d864f0702578e6540207c1472992fab06d63b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 2 Nov 2023 14:47:43 +0100 Subject: [PATCH 092/127] Always sort internal Sorter entries in parallel --- .../index_documents/helpers/grenad_helpers.rs | 1 + milli/src/update/index_documents/transform.rs | 51 ++++++------------- 2 files changed, 17 insertions(+), 35 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index cc0ccb609..03a3d6f5f 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -47,6 +47,7 @@ pub fn create_sorter( builder.allow_realloc(false); } builder.sort_algorithm(sort_algorithm); + builder.sort_in_parallel(true); builder.build() } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8d1750c49..23b5c78c1 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -114,43 +114,24 @@ impl<'a, 'i> Transform<'a, 'i> { }; // We initialize the sorter with the user indexing settings. 
- let original_sorter = { - let mut builder = grenad::Sorter::builder(merge_function); - builder.chunk_compression_type(indexer_settings.chunk_compression_type); - if let Some(level) = indexer_settings.chunk_compression_level { - builder.chunk_compression_level(level); - } - if let Some(nb_chunks) = indexer_settings.max_nb_chunks { - builder.max_nb_chunks(nb_chunks); - } - if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) { - builder.dump_threshold(memory); - builder.allow_realloc(false); - } - builder.sort_algorithm(grenad::SortAlgorithm::Stable); - builder.sort_in_parallel(true); - builder.build() - }; + let original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_function, + indexer_settings.chunk_compression_type, + indexer_settings.chunk_compression_level, + indexer_settings.max_nb_chunks, + indexer_settings.max_memory.map(|mem| mem / 2), + ); // We initialize the sorter with the user indexing settings. - let flattened_sorter = { - let mut builder = grenad::Sorter::builder(merge_function); - builder.chunk_compression_type(indexer_settings.chunk_compression_type); - if let Some(level) = indexer_settings.chunk_compression_level { - builder.chunk_compression_level(level); - } - if let Some(nb_chunks) = indexer_settings.max_nb_chunks { - builder.max_nb_chunks(nb_chunks); - } - if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) { - builder.dump_threshold(memory); - builder.allow_realloc(false); - } - builder.sort_algorithm(grenad::SortAlgorithm::Stable); - builder.sort_in_parallel(true); - builder.build() - }; - + let flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_function, + indexer_settings.chunk_compression_type, + indexer_settings.chunk_compression_level, + indexer_settings.max_nb_chunks, + indexer_settings.max_memory.map(|mem| mem / 2), + ); let documents_ids = index.documents_ids(wtxn)?; Ok(Transform { From 12323d610e33b1f0dcdaa97ddc90c5b59b599417 Mon Sep 17 00:00:00 2001 From: 
ManyTheFish Date: Tue, 31 Oct 2023 16:46:16 +0100 Subject: [PATCH 093/127] Change the original document sorter key from the internal docid to a concatenation of the internal and the external docid --- milli/src/update/index_documents/mod.rs | 2 + milli/src/update/index_documents/transform.rs | 116 ++++++++++-------- 2 files changed, 69 insertions(+), 49 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c32f907b2..129b67cf0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1387,6 +1387,8 @@ mod tests { index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap(); let rtxn = index.read_txn().unwrap(); + let all_documents_count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(all_documents_count, 1); let external_documents_ids = index.external_documents_ids(); assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some()); } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 23b5c78c1..3863d5a54 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -174,7 +174,8 @@ impl<'a, 'i> Transform<'a, 'i> { self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut obkv_buffer = Vec::new(); - let mut document_sorter_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); let mut documents_count = 0; let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); @@ -268,57 +269,64 @@ impl<'a, 'i> Transform<'a, 'i> { // we associate the base document with the new key, everything will get merged later. 
let keep_original_version = self.index_documents_method == IndexDocumentsMethod::UpdateDocuments; - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(base_obkv), true, keep_original_version, - &mut document_sorter_buffer, + &mut document_sorter_value_buffer, )?; - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_buffer)?; let base_obkv = KvReader::new(base_obkv); if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? { // we recreate our buffer with the flattened documents - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(&flattened_obkv), true, keep_original_version, - &mut document_sorter_buffer, + &mut document_sorter_value_buffer, )?; } - self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } } if !skip_insertion { self.new_documents_ids.insert(docid); - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(&obkv_buffer), false, true, - &mut 
document_sorter_buffer, + &mut document_sorter_value_buffer, )?; // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; let flattened_obkv = KvReader::new(&obkv_buffer); if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Addition as u8); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(&obkv), false, true, - &mut document_sorter_buffer, + &mut document_sorter_value_buffer, )? } - self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } documents_count += 1; @@ -372,37 +380,42 @@ impl<'a, 'i> Transform<'a, 'i> { let external_documents_ids = self.index.external_documents_ids(); let mut documents_deleted = 0; - let mut document_sorter_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); for to_remove in to_remove { if should_abort() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } // Check if the document has been added in the current indexing process. - let deleted_from_current = match self - .new_external_documents_ids_builder - .entry((*to_remove).into()) - { - // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. 
- HEntry::Occupied(entry) => { - let doc_id = *entry.get() as u32; - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Deletion as u8); - obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap(); - self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?; - self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?; + let deleted_from_current = + match self.new_external_documents_ids_builder.entry((*to_remove).into()) { + // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. + HEntry::Occupied(entry) => { + let docid = *entry.get() as u32; + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap(); + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; - // we must NOT update the list of replaced_documents_ids - // Either: - // 1. It's already in it and there is nothing to do - // 2. It wasn't in it because the document was created by a previous batch and since - // we're removing it there is nothing to do. - self.new_documents_ids.remove(doc_id); - entry.remove_entry(); - true - } - HEntry::Vacant(_) => false, - }; + // we must NOT update the list of replaced_documents_ids + // Either: + // 1. It's already in it and there is nothing to do + // 2. It wasn't in it because the document was created by a previous batch and since + // we're removing it there is nothing to do. 
+ self.new_documents_ids.remove(docid); + entry.remove_entry(); + true + } + HEntry::Vacant(_) => false, + }; // If the document was already in the db we mark it as a `to_delete` document. // Then we push the document in sorters in deletion mode. @@ -422,31 +435,36 @@ impl<'a, 'i> Transform<'a, 'i> { key: None, })?; + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); // push it as to delete in the original_sorter - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Deletion as u8); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); into_del_add_obkv( KvReaderU16::new(base_obkv), true, false, - &mut document_sorter_buffer, + &mut document_sorter_value_buffer, )?; - self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; // flatten it and push it as to delete in the flattened_sorter let flattened_obkv = KvReader::new(base_obkv); if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? 
{ // we recreate our buffer with the flattened documents - document_sorter_buffer.clear(); - document_sorter_buffer.push(Operation::Deletion as u8); + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); into_del_add_obkv( KvReaderU16::new(&obkv), true, false, - &mut document_sorter_buffer, + &mut document_sorter_value_buffer, )?; } - self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; true } From 4b64c33aa2525a8fc79e7a318ec2566c867e5f66 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 31 Oct 2023 17:44:42 +0100 Subject: [PATCH 094/127] update vector extractor --- .../extract/extract_vector_points.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 863bc07c3..9aed862ab 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -7,7 +7,8 @@ use serde_json::{from_slice, Value}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::UserError; -use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; +use crate::update::index_documents::helpers::try_split_at; +use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors}; /// Extracts the embedding vector contained in each document under the `_vectors` field. /// @@ -28,15 +29,17 @@ pub fn extract_vector_points( ); let mut cursor = obkv_documents.into_cursor()?; - while let Some((docid_bytes, value)) = cursor.move_on_next()? { + while let Some((key, value)) = cursor.move_on_next()? 
{ + // this must always be serialized as (docid, external_docid); + let (docid_bytes, external_id_bytes) = + try_split_at(key, std::mem::size_of::()).unwrap(); + debug_assert!(std::str::from_utf8(external_id_bytes).is_ok()); + let obkv = obkv::KvReader::new(value); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let document_id = || -> Value { - let document_id = obkv.get(primary_key_id).unwrap(); - from_slice(document_id).unwrap() - }; + let document_id = || -> Value { std::str::from_utf8(external_id_bytes).unwrap().into() }; // first we retrieve the _vectors field if let Some(vectors) = obkv.get(vectors_fid) { From 1b4ff991c03ec56455a8d18f72e657c63ccd07a1 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 31 Oct 2023 17:44:57 +0100 Subject: [PATCH 095/127] update typed chunks --- milli/src/external_documents_ids.rs | 4 -- .../extract/extract_vector_points.rs | 1 - .../src/update/index_documents/extract/mod.rs | 9 +-- .../src/update/index_documents/typed_chunk.rs | 62 +++++++------------ 4 files changed, 22 insertions(+), 54 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 1bf08396a..ee8d29ffc 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -74,10 +74,6 @@ impl ExternalDocumentsIds { for DocumentOperation { external_id, internal_id, kind } in operations { match kind { DocumentOperationKind::Create => { - // TODO should we get before insert to be able to detect bugs? 
- // if matches!(kind, DocumentOperationKind::Create) { - // panic!("Attempting to create an already-existing document"); - // } self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?; } DocumentOperationKind::Delete => { diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 9aed862ab..1f5edeeeb 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -17,7 +17,6 @@ use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors}; pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - primary_key_id: FieldId, vectors_fid: FieldId, ) -> Result>> { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 41722a53e..ee8713ee8 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -63,7 +63,6 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), vectors_field_id, - primary_key_id, ) }) .collect::>()?; @@ -274,7 +273,6 @@ fn send_original_documents_data( indexer: GrenadParameters, lmdb_writer_sx: Sender>, vectors_field_id: Option, - primary_key_id: FieldId, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -283,12 +281,7 @@ fn send_original_documents_data( let documents_chunk_cloned = original_documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let result = extract_vector_points( - documents_chunk_cloned, - indexer, - primary_key_id, - vectors_field_id, - ); + let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id); let _ = match result { Ok(vector_points) => { 
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 1b38be03b..7c3f587d2 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -17,6 +17,7 @@ use crate::distance::NDotProductPoint; use crate::error::UserError; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; +use crate::index::db_name::DOCUMENTS; use crate::index::Hnsw; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -24,7 +25,7 @@ use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_arr use crate::update::index_documents::validate_document_id_value; use crate::{ lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, - Result, BEU32, + Result, SerializationError, BEU32, }; pub(crate) enum TypedChunk { @@ -124,13 +125,15 @@ pub(crate) fn write_typed_chunk_into_index( let mut operations: Vec = Default::default(); let mut docids = index.documents_ids(wtxn)?; - let primary_key = index.primary_key(wtxn)?.unwrap(); - let primary_key = index.fields_ids_map(wtxn)?.id(primary_key).unwrap(); let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((docid, reader)) = cursor.move_on_next()? { + while let Some((key, reader)) = cursor.move_on_next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); - let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap(); + + let (document_id_bytes, external_id_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?; + let docid = DocumentId::from_be_bytes(document_id_bytes); + let external_id = std::str::from_utf8(external_id_bytes)?; for (field_id, value) in reader.iter() { let del_add_reader = KvReaderDelAdd::new(value); @@ -140,45 +143,10 @@ pub(crate) fn write_typed_chunk_into_index( ) { (None, None) => {} (None, Some(value)) => { - // if primary key, new document - if field_id == primary_key { - // FIXME: we already extracted the external docid before. We should retrieve it in the typed chunk - // rather than re-extract it here - // FIXME: unwraps - let document_id = serde_json::from_slice(value) - .map_err(InternalError::SerdeJson) - .unwrap(); - let external_id = - validate_document_id_value(document_id).unwrap().unwrap(); - operations.push(DocumentOperation { - external_id, - internal_id: docid, - kind: DocumentOperationKind::Create, - }); - docids.insert(docid); - } // anyway, write writer.insert(field_id, value)?; } - (Some(value), None) => { - // if primary key, deleted document - if field_id == primary_key { - // FIXME: we already extracted the external docid before. 
We should retrieve it in the typed chunk - // rather than re-extract it here - // FIXME: unwraps - let document_id = serde_json::from_slice(value) - .map_err(InternalError::SerdeJson) - .unwrap(); - let external_id = - validate_document_id_value(document_id).unwrap().unwrap(); - operations.push(DocumentOperation { - external_id, - internal_id: docid, - kind: DocumentOperationKind::Delete, - }); - docids.remove(docid); - } - } + (Some(_), None) => {} (Some(_), Some(value)) => { // updated field, write writer.insert(field_id, value)?; @@ -190,8 +158,20 @@ pub(crate) fn write_typed_chunk_into_index( if !writer.is_empty() { db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?; + operations.push(DocumentOperation { + external_id: external_id.to_string(), + internal_id: docid, + kind: DocumentOperationKind::Create, + }); + docids.insert(docid); } else { db.delete(wtxn, &BEU32::new(docid))?; + operations.push(DocumentOperation { + external_id: external_id.to_string(), + internal_id: docid, + kind: DocumentOperationKind::Delete, + }); + docids.remove(docid); } } let external_documents_docids = index.external_documents_ids(); From bc51d6157adceee09f529789e66f66f9d08d8fac Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Nov 2023 13:37:54 +0100 Subject: [PATCH 096/127] Fix transform reindexing path --- milli/src/external_documents_ids.rs | 2 +- milli/src/update/index_documents/transform.rs | 82 +++++++++++++------ 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index ee8d29ffc..a002fc064 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -18,7 +18,7 @@ pub struct DocumentOperation { pub kind: DocumentOperationKind, } -pub struct ExternalDocumentsIds(Database>); +pub struct ExternalDocumentsIds(pub Database>); impl ExternalDocumentsIds { pub fn new(db: Database>) -> ExternalDocumentsIds { diff --git 
a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 3863d5a54..82cf55d42 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,14 +14,15 @@ use serde_json::Value; use smartstring::SmartString; use super::helpers::{ - create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions, - obkvs_merge_additions_and_deletions, MergeFn, + create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn, }; use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd}; +use crate::update::index_documents::GrenadParameters; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -772,24 +773,35 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; - // We create a final writer to write the new documents in order from the sorter. - let mut original_writer = create_writer( + // We initialize the sorter with the user indexing settings. + let mut original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - tempfile::tempfile()?, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), ); - // We create a final writer to write the new documents in order from the sorter. - let mut flattened_writer = create_writer( + // We initialize the sorter with the user indexing settings. 
+ let mut flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - tempfile::tempfile()?, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), ); let mut obkv_buffer = Vec::new(); - let mut document_sorter_buffer = Vec::new(); - for result in self.index.all_documents(wtxn)? { - let (docid, obkv) = result?; + let mut document_sorter_key_buffer = Vec::new(); + let mut document_sorter_value_buffer = Vec::new(); + for result in self.index.external_documents_ids().0.iter(wtxn)? { + let (external_id, docid) = result?; + let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; + let docid = docid.get(); obkv_buffer.clear(); let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); @@ -802,9 +814,18 @@ impl<'a, 'i> Transform<'a, 'i> { } let buffer = obkv_writer.into_inner()?; - document_sorter_buffer.clear(); - into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?; - original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + document_sorter_value_buffer.clear(); + into_del_add_obkv( + KvReaderU16::new(buffer), + false, + true, + &mut document_sorter_value_buffer, + )?; + original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; // Once we have the document. We're going to flatten it // and insert it in the flattened sorter. 
@@ -839,18 +860,27 @@ impl<'a, 'i> Transform<'a, 'i> { let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; writer.insert(fid, &value)?; } - document_sorter_buffer.clear(); - into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?; - flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?; + document_sorter_value_buffer.clear(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + false, + true, + &mut document_sorter_value_buffer, + )?; + flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } - // Once we have written all the documents, we extract - // the file and reset the seek to be able to read it again. - let mut original_documents = original_writer.into_inner()?; - original_documents.rewind()?; + let grenad_params = GrenadParameters { + chunk_compression_type: self.indexer_settings.chunk_compression_type, + chunk_compression_level: self.indexer_settings.chunk_compression_level, + max_memory: self.indexer_settings.max_memory, + max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen. + }; - let mut flattened_documents = flattened_writer.into_inner()?; - flattened_documents.rewind()?; + // Once we have written all the documents, we merge everything into a Reader. 
+ let original_documents = sorter_into_reader(original_sorter, grenad_params)?; + + let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; let output = TransformOutput { primary_key, @@ -862,10 +892,8 @@ impl<'a, 'i> Transform<'a, 'i> { // FIXME: remove this now unused field replaced_documents_ids: RoaringBitmap::default(), documents_count, - original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, - flattened_documents: flattened_documents - .into_inner() - .map_err(|err| err.into_error())?, + original_documents: original_documents.into_inner().into_inner(), + flattened_documents: flattened_documents.into_inner().into_inner(), }; let new_facets = output.compute_real_facets(wtxn, self.index)?; From 5b20e625f3a9cf543a47e88a70446de55a463844 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Nov 2023 15:31:37 +0100 Subject: [PATCH 097/127] fix merge --- milli/src/update/index_documents/transform.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 82cf55d42..2eec69da5 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -281,7 +281,8 @@ impl<'a, 'i> Transform<'a, 'i> { keep_original_version, &mut document_sorter_value_buffer, )?; - self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_buffer)?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; let base_obkv = KvReader::new(base_obkv); if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? 
{ // we recreate our buffer with the flattened documents @@ -294,7 +295,8 @@ impl<'a, 'i> Transform<'a, 'i> { &mut document_sorter_value_buffer, )?; } - self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; } } @@ -465,7 +467,8 @@ impl<'a, 'i> Transform<'a, 'i> { &mut document_sorter_value_buffer, )?; } - self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; + self.flattened_sorter + .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; true } From bf0651f23cdb7decc9b3c7fe31dbc9fc21e429be Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Nov 2023 15:37:05 +0100 Subject: [PATCH 098/127] Implement iter method on ExternalDocumentsIds --- milli/src/external_documents_ids.rs | 7 ++++++- milli/src/update/index_documents/transform.rs | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index a002fc064..e0a71b7cd 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -18,7 +18,7 @@ pub struct DocumentOperation { pub kind: DocumentOperationKind, } -pub struct ExternalDocumentsIds(pub Database>); +pub struct ExternalDocumentsIds(Database>); impl ExternalDocumentsIds { pub fn new(db: Database>) -> ExternalDocumentsIds { @@ -86,6 +86,11 @@ impl ExternalDocumentsIds { Ok(()) } + + /// Returns an iterator over all the external ids. + pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + self.0.iter(rtxn) + } } /// An iterator over mappings between requested internal ids and external ids. 
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 2eec69da5..7c500799d 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -799,7 +799,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut obkv_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); let mut document_sorter_value_buffer = Vec::new(); - for result in self.index.external_documents_ids().0.iter(wtxn)? { + for result in self.index.external_documents_ids().iter(wtxn)? { let (external_id, docid) = result?; let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, From ff522c919d5065499fa01bdf2c21747bade40e60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 2 Nov 2023 15:58:08 +0100 Subject: [PATCH 099/127] Fix the vector extractions for the diff indexing --- .../extract/extract_vector_points.rs | 127 ++++++++++++++---- .../src/update/index_documents/typed_chunk.rs | 81 +++++++---- 2 files changed, 156 insertions(+), 52 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 1f5edeeeb..317a9aec3 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1,15 +1,25 @@ +use std::cmp::Ordering; use std::convert::TryFrom; use std::fs::File; -use std::io::{self, BufReader}; +use std::io::{self, BufReader, BufWriter}; +use std::mem::size_of; +use std::str::from_utf8; use bytemuck::cast_slice; +use grenad::Writer; +use itertools::EitherOrBoth; +use ordered_float::OrderedFloat; use serde_json::{from_slice, Value}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::UserError; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, 
KvWriterDelAdd}; use crate::update::index_documents::helpers::try_split_at; use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors}; +/// The length of the elements that are always in the buffer when inserting new values. +const TRUNCATE_SIZE: usize = size_of::(); + /// Extracts the embedding vector contained in each document under the `_vectors` field. /// /// Returns the generated grenad reader containing the docid as key associated to the Vec @@ -27,45 +37,112 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // this must always be serialized as (docid, external_docid); let (docid_bytes, external_id_bytes) = try_split_at(key, std::mem::size_of::()).unwrap(); - debug_assert!(std::str::from_utf8(external_id_bytes).is_ok()); + debug_assert!(from_utf8(external_id_bytes).is_ok()); let obkv = obkv::KvReader::new(value); + key_buffer.clear(); + key_buffer.extend_from_slice(docid_bytes); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let document_id = || -> Value { std::str::from_utf8(external_id_bytes).unwrap().into() }; + let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; // first we retrieve the _vectors field - if let Some(vectors) = obkv.get(vectors_fid) { - // extract the vectors - let vectors = match from_slice(vectors) { - Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors), - Err(_) => { - return Err(UserError::InvalidVectorsType { - document_id: document_id(), - value: from_slice(vectors).map_err(InternalError::SerdeJson)?, - } - .into()) - } - }; + if let Some(value) = obkv.get(vectors_fid) { + let vectors_obkv = KvReaderDelAdd::new(value); - if let Some(vectors) = vectors { - for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) { - let index = 
u16::try_from(i).unwrap(); - let mut key = docid_bytes.to_vec(); - key.extend_from_slice(&index.to_be_bytes()); - let bytes = cast_slice(&vector); - writer.insert(key, bytes)?; - } - } + // then we extract the values + let del_vectors = vectors_obkv + .get(DelAdd::Deletion) + .map(|vectors| extract_vectors(vectors, document_id)) + .transpose()? + .flatten(); + let add_vectors = vectors_obkv + .get(DelAdd::Addition) + .map(|vectors| extract_vectors(vectors, document_id)) + .transpose()? + .flatten(); + + // and we finally push the unique vectors into the writer + push_vectors_diff( + &mut writer, + &mut key_buffer, + del_vectors.unwrap_or_default(), + add_vectors.unwrap_or_default(), + )?; } - // else => the `_vectors` object was `null`, there is nothing to do } writer_into_reader(writer) } + +/// Computes the diff between both Del and Add numbers and +/// only inserts the parts that differ in the sorter. +fn push_vectors_diff( + writer: &mut Writer>, + key_buffer: &mut Vec, + mut del_vectors: Vec>, + mut add_vectors: Vec>, +) -> Result<()> { + // We sort and dedup the vectors + del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + + let merged_vectors_iter = + itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); + + // insert vectors into the writer + for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { + // Generate the key by extending the unique index to it. 
+ key_buffer.truncate(TRUNCATE_SIZE); + let index = u16::try_from(i).unwrap(); + key_buffer.extend_from_slice(&index.to_be_bytes()); + + match eob { + EitherOrBoth::Both(_, _) => (), // no need to touch anything + EitherOrBoth::Left(vector) => { + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + writer.insert(&key_buffer, bytes)?; + } + EitherOrBoth::Right(vector) => { + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + writer.insert(&key_buffer, bytes)?; + } + } + } + + Ok(()) +} + +/// Compares two vectors by using the OrderingFloat helper. +fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { + a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) +} + +/// Extracts the vectors from a JSON value. 
+fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result>>> { + match from_slice(value) { + Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)), + Err(_) => Err(UserError::InvalidVectorsType { + document_id: document_id(), + value: from_slice(value).map_err(InternalError::SerdeJson)?, + } + .into()), + } +} diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7c3f587d2..80671e39f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -8,7 +8,9 @@ use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::RwTxn; +use log::error; use obkv::{KvReader, KvWriter}; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap}; @@ -22,10 +24,9 @@ use crate::index::Hnsw; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; -use crate::update::index_documents::validate_document_id_value; use crate::{ - lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, - Result, SerializationError, BEU32, + lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result, + SerializationError, BEU32, }; pub(crate) enum TypedChunk { @@ -366,44 +367,70 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } TypedChunk::VectorPoints(vector_points) => { - let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? 
{ - Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(), - None => Default::default(), - }; - - // Convert the PointIds into DocumentIds - let mut docids = Vec::new(); - for pid in pids { - let docid = - index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap(); - docids.push(docid.get()); + let mut vectors_set = HashSet::new(); + // We extract and store the previous vectors + if let Some(hnsw) = index.vector_hnsw(wtxn)? { + for (pid, point) in hnsw.iter() { + let pid_key = BEU32::new(pid.into_inner()); + let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap().get(); + let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect(); + vectors_set.insert((docid, vector)); + } } - let mut expected_dimensions = points.get(0).map(|p| p.len()); let mut cursor = vector_points.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // convert the key back to a u32 (4 bytes) let (left, _index) = try_split_array_at(key).unwrap(); let docid = DocumentId::from_be_bytes(left); - // convert the vector back to a Vec - let vector: Vec = pod_collect_to_vec(value); - // TODO Inform the user about the document that has a wrong `_vectors` - let found = vector.len(); - let expected = *expected_dimensions.get_or_insert(found); - if expected != found { - return Err(UserError::InvalidVectorDimensions { expected, found }.into()); + let vector_deladd_obkv = KvReaderDelAdd::new(value); + if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { + // convert the vector back to a Vec + let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + let key = (docid, vector); + if !vectors_set.remove(&key) { + error!("Unable to delete the vector: {:?}", key.1); + } + } + if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { + // convert the vector back to a Vec + let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + vectors_set.insert((docid, vector)); } - - 
points.push(NDotProductPoint::new(vector)); - docids.push(docid); } - assert_eq!(docids.len(), points.len()); + // Extract the most common vector dimension + let expected_dimension_size = { + let mut dims = HashMap::new(); + vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1); + dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) + }; + + // Ensure that the vector lenghts are correct and + // prepare the vectors before inserting them in the HNSW. + let mut points = Vec::new(); + let mut docids = Vec::new(); + for (docid, vector) in vectors_set { + if expected_dimension_size.map_or(false, |expected| expected != vector.len()) { + return Err(UserError::InvalidVectorDimensions { + expected: expected_dimension_size.unwrap_or(vector.len()), + found: vector.len(), + } + .into()); + } else { + let vector = vector.into_iter().map(OrderedFloat::into_inner).collect(); + points.push(NDotProductPoint::new(vector)); + docids.push(docid); + } + } let hnsw_length = points.len(); let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); + assert_eq!(docids.len(), pids.len()); + + // Store the vectors in the point-docid relation database index.vector_id_docid.clear(wtxn)?; for (docid, pid) in docids.into_iter().zip(pids) { index.vector_id_docid.put( From 87610a5f988ac59c786feff1cd1fd019ccf67366 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Nov 2023 16:49:03 +0100 Subject: [PATCH 100/127] Don't try to delete a document that is not in the database --- .../update/index_documents/helpers/grenad_helpers.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 03a3d6f5f..4f764ab95 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -223,11 +223,13 @@ pub fn grenad_obkv_into_chunks( ); while let 
Some((document_id, obkv)) = cursor.move_on_next()? { - obkv_documents.insert(document_id, obkv)?; - current_chunk_size += document_id.len() as u64 + obkv.len() as u64; + if !obkv.is_empty() { + obkv_documents.insert(document_id, obkv)?; + current_chunk_size += document_id.len() as u64 + obkv.len() as u64; - if current_chunk_size >= documents_chunk_size as u64 { - return writer_into_reader(obkv_documents).map(Some); + if current_chunk_size >= documents_chunk_size as u64 { + return writer_into_reader(obkv_documents).map(Some); + } } } From 1ad1fcc8c83863d4ff261d62787bf14cddc0c78a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 6 Nov 2023 10:31:14 +0100 Subject: [PATCH 101/127] Remove all warnings --- .../extract/extract_docid_word_positions.rs | 4 ++-- milli/src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 3 --- milli/src/update/index_documents/transform.rs | 12 ------------ 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 96156adb4..9895c1a64 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -30,7 +30,7 @@ pub fn extract_docid_word_positions( allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader>, ScriptLanguageDocidsMap)> { +) -> Result<(grenad::Reader>, ScriptLanguageDocidsMap)> { puffin::profile_function!(); let max_positions_per_attributes = max_positions_per_attributes @@ -154,7 +154,7 @@ pub fn extract_docid_word_positions( // the returned sorter is serialized as: key: (DocId, FieldId), value: KV>. 
sorter_into_reader(docid_word_positions_sorter, indexer) - .map(|reader| (documents_ids, reader, script_language_docids)) + .map(|reader| (reader, script_language_docids)) } /// Check if any searchable fields of a document changed. diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index ee8713ee8..91f3e1c62 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -348,7 +348,7 @@ fn send_and_extract_flattened_documents_data( let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { - let (documents_ids, docid_word_positions_chunk, script_language_pair) = + let (docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( flattened_documents_chunk.clone(), indexer, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 129b67cf0..66e6d16dc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -245,9 +245,6 @@ where primary_key, fields_ids_map, field_distribution, - new_external_documents_ids, - new_documents_ids, - replaced_documents_ids, documents_count, original_documents, flattened_documents, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 7c500799d..186974bfe 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -32,9 +32,6 @@ pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, pub field_distribution: FieldDistribution, - pub new_external_documents_ids: fst::Map>, - pub new_documents_ids: RoaringBitmap, - pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, pub original_documents: File, pub flattened_documents: File, @@ -735,15 +732,11 @@ impl<'a, 'i> Transform<'a, 'i> { 
new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { fst_new_external_documents_ids_builder.insert(key, value) })?; - let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); Ok(TransformOutput { primary_key, fields_ids_map: self.fields_ids_map, field_distribution, - new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(), - new_documents_ids: self.new_documents_ids, - replaced_documents_ids: self.replaced_documents_ids, documents_count: self.documents_count, original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, flattened_documents: flattened_documents @@ -889,11 +882,6 @@ impl<'a, 'i> Transform<'a, 'i> { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - // FIXME: remove this now unused field - new_external_documents_ids: fst::Map::default().map_data(Cow::Owned).unwrap(), - new_documents_ids: documents_ids, - // FIXME: remove this now unused field - replaced_documents_ids: RoaringBitmap::default(), documents_count, original_documents: original_documents.into_inner().into_inner(), flattened_documents: flattened_documents.into_inner().into_inner(), From 1b2ea6cf19309782a2e3b2ff2fe6d7708dd5de4f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 6 Nov 2023 10:46:22 +0100 Subject: [PATCH 102/127] REVERT ME: ignore prefix pair databases tests --- milli/src/update/prefix_word_pairs/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index e718f9b77..320c01461 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -171,6 +171,7 @@ mod tests { documents } + #[ignore] #[test] fn add_new_documents() { let mut index = TempIndex::new(); @@ -235,6 +236,7 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); db_snap!(index, prefix_word_pair_proximity_docids, "update"); } + 
#[ignore] #[test] fn batch_bug_3043() { // https://github.com/meilisearch/meilisearch/issues/3043 @@ -283,6 +285,7 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids); } + #[ignore] #[test] fn hard_delete_and_reupdate() { let mut index = TempIndex::new(); @@ -357,6 +360,7 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); } + #[ignore] #[test] fn replace_hard_deletion() { let mut index = TempIndex::new(); From 1bccf2079ed6c92669f7100dbebeb79d23668956 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 6 Nov 2023 11:03:56 +0100 Subject: [PATCH 103/127] Correctly mark non-tests as non-tests --- milli/src/search/facet/facet_sort_ascending.rs | 4 ++-- milli/src/update/facet/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 892401c08..0197639e4 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec; /// The documents returned by the iterator are grouped by the facet values that /// determined their rank. 
For example, given the documents: /// -/// ```ignore +/// ```text /// 0: { "colour": ["blue", "green"] } /// 1: { "colour": ["blue", "red"] } /// 2: { "colour": ["orange", "red"] } @@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec; /// ``` /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator /// over the following elements: -/// ```ignore +/// ```text /// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue" /// [3] // same for "green" /// [2] // same for "orange" diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 7358ceb6c..52fea0f5f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -14,7 +14,7 @@ The databases must be able to return results for queries such as: The algorithms that implement these queries are found in the `src/search/facet` folder. To make these queries fast to compute, the database adopts a tree structure: -```ignore +```text ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ ┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ │Level 2│ │ │ │ │ @@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`. 
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a [`FacetGroupValue`], which have the following format: -```ignore +```text FacetGroupKey: - field id : u16 - level : u8 From cbaa54cafdf8e91c798958b5335e125880b78a1d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 6 Nov 2023 11:19:31 +0100 Subject: [PATCH 104/127] Fix clippy issues --- index-scheduler/src/batch.rs | 2 +- milli/src/external_documents_ids.rs | 5 ++--- milli/src/index.rs | 2 +- .../extract/extract_docid_word_positions.rs | 9 +++++---- .../extract/extract_fid_word_count_docids.rs | 2 +- .../index_documents/extract/extract_word_docids.rs | 12 ++++++------ .../extract/extract_word_pair_proximity_docids.rs | 9 ++++----- .../extract/extract_word_position_docids.rs | 2 +- .../index_documents/helpers/merge_functions.rs | 2 +- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 3 ++- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index ebdba0a8c..d96891d82 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1292,7 +1292,7 @@ impl IndexScheduler { || must_stop_processing.get(), )?; - let document_ids = documents.iter().cloned().flatten().collect(); + let document_ids = documents.iter().flatten().cloned().collect(); let (new_builder, user_result) = builder.remove_documents(document_ids)?; builder = new_builder; diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index e0a71b7cd..0e4891649 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::convert::TryInto; use heed::types::{OwnedType, Str}; use heed::{Database, RoIter, RoTxn, RwTxn}; @@ -31,7 +30,7 @@ impl ExternalDocumentsIds { } pub fn get>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result> { - Ok(self.0.get(rtxn, 
external_id.as_ref())?.map(|x| x.get().try_into().unwrap())) + Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get())) } /// An helper function to debug this type, returns an `HashMap` of both, @@ -40,7 +39,7 @@ impl ExternalDocumentsIds { let mut map = HashMap::default(); for result in self.0.iter(rtxn)? { let (external, internal) = result?; - map.insert(external.to_owned(), internal.get().try_into().unwrap()); + map.insert(external.to_owned(), internal.get()); } Ok(map) } diff --git a/milli/src/index.rs b/milli/src/index.rs index a52033fb6..86ef6105b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1376,7 +1376,7 @@ impl Index { rtxn: &RoTxn, key: &(Script, Language), ) -> heed::Result> { - Ok(self.script_language_docids.get(rtxn, key)?) + self.script_language_docids.get(rtxn, key) } pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result>> { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9895c1a64..0dcd6a42a 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -198,7 +198,7 @@ fn tokenizer_builder<'a>( } if let Some(script_language) = script_language { - tokenizer_builder.allow_list(&script_language); + tokenizer_builder.allow_list(script_language); } tokenizer_builder @@ -206,6 +206,7 @@ fn tokenizer_builder<'a>( /// Extract words maped with their positions of a document, /// ensuring no Language detection mistakes was made. 
+#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct fn lang_safe_tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, @@ -220,9 +221,9 @@ fn lang_safe_tokens_from_document<'a>( let mut script_language_word_count = HashMap::new(); tokens_from_document( - &obkv, + obkv, searchable_fields, - &tokenizer, + tokenizer, max_positions_per_attributes, del_add, buffers, @@ -257,7 +258,7 @@ fn lang_safe_tokens_from_document<'a>( // rerun the extraction. tokens_from_document( - &obkv, + obkv, searchable_fields, &tokenizer, max_positions_per_attributes, diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index accf4a510..182d0c5d8 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -45,7 +45,7 @@ pub fn extract_fid_word_count_docids( .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - let del_add_reader = KvReaderDelAdd::new(&value); + let del_add_reader = KvReaderDelAdd::new(value); let deletion = del_add_reader // get deleted words .get(DelAdd::Deletion) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5266e9bff..f278012c7 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -57,17 +57,17 @@ pub fn extract_word_docids( let document_id = u32::from_be_bytes(document_id_bytes); let fid = u16::from_be_bytes(fid_bytes); - let del_add_reader = KvReaderDelAdd::new(&value); + let del_add_reader = KvReaderDelAdd::new(value); // extract all unique words to remove. 
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { - for (_pos, word) in KvReaderU16::new(&deletion).iter() { + for (_pos, word) in KvReaderU16::new(deletion).iter() { del_words.insert(word.to_vec()); } } // extract all unique additional words. if let Some(addition) = del_add_reader.get(DelAdd::Addition) { - for (_pos, word) in KvReaderU16::new(&addition).iter() { + for (_pos, word) in KvReaderU16::new(addition).iter() { add_words.insert(word.to_vec()); } } @@ -122,9 +122,9 @@ pub fn extract_word_docids( // every words contained in an attribute set to exact must be pushed in the exact_words list. if exact_attributes.contains(&fid) { - exact_word_docids_sorter.insert(word.as_bytes(), &value)?; + exact_word_docids_sorter.insert(word.as_bytes(), value)?; } else { - word_docids_sorter.insert(word.as_bytes(), &value)?; + word_docids_sorter.insert(word.as_bytes(), value)?; } } @@ -169,7 +169,7 @@ fn words_into_sorter( }; key_buffer.clear(); - key_buffer.extend_from_slice(&word_bytes); + key_buffer.extend_from_slice(word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&fid.to_be_bytes()); word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 76a1d1d68..b8a377247 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -29,7 +29,6 @@ pub fn extract_word_pair_proximity_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) - .into_iter() .map(|_| { create_sorter( grenad::SortAlgorithm::Unstable, @@ -75,7 +74,7 @@ pub fn extract_word_pair_proximity_docids( let (del, add): (Result<_>, Result<_>) = rayon::join( || { // deletions - if let Some(deletion) = 
KvReaderDelAdd::new(&value).get(DelAdd::Deletion) { + if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { for (position, word) in KvReaderU16::new(deletion).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. while del_word_positions.get(0).map_or(false, |(_w, p)| { @@ -104,7 +103,7 @@ pub fn extract_word_pair_proximity_docids( }, || { // additions - if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) { + if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { for (position, word) in KvReaderU16::new(addition).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. while add_word_positions.get(0).map_or(false, |(_w, p)| { @@ -170,7 +169,7 @@ fn document_word_positions_into_sorter( document_id: DocumentId, del_word_pair_proximity: &BTreeMap<(String, String), u8>, add_word_pair_proximity: &BTreeMap<(String, String), u8>, - word_pair_proximity_docids_sorters: &mut Vec>, + word_pair_proximity_docids_sorters: &mut [grenad::Sorter], ) -> Result<()> { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; @@ -201,7 +200,7 @@ fn document_word_positions_into_sorter( }; key_buffer.clear(); - key_buffer.push(*prox as u8); + key_buffer.push(*prox); key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 2ff2f2ad5..1b9ec66ff 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -60,7 +60,7 @@ pub fn extract_word_position_docids( current_document_id = Some(document_id); - let del_add_reader = KvReaderDelAdd::new(&value); + let del_add_reader = 
KvReaderDelAdd::new(value); // extract all unique words to remove. if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { for (position, word_bytes) in KvReaderU16::new(deletion).iter() { diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 770629c8e..98c1c1a04 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -157,7 +157,7 @@ fn inner_merge_del_add_obkvs<'a>( let mut acc = newest[1..].to_vec(); let mut buffer = Vec::new(); // reverse iter from the most recent to the oldest. - for current in obkvs.into_iter().rev() { + for current in obkvs.iter().rev() { // if in the previous iteration there was a complete deletion, // stop the merge process. if acc_operation_type == Operation::Deletion as u8 { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 66e6d16dc..2be410ace 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2659,7 +2659,7 @@ mod tests { let external_document_ids = index.external_documents_ids(); let ids_to_delete: Vec = external_ids .iter() - .map(|id| external_document_ids.get(&wtxn, id).unwrap().unwrap()) + .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap()) .collect(); // Delete some documents. diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 80671e39f..b53d859cd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -456,7 +456,7 @@ pub(crate) fn write_typed_chunk_into_index( if final_value.is_empty() { // If the database entry exists, delete it. 
- if db_key_exists == true { + if db_key_exists { index.script_language_docids.delete(wtxn, &key)?; } } else { @@ -501,6 +501,7 @@ fn merge_word_docids_reader_into_fst( /// /// If there is no Add side we currently write an empty buffer /// which is a valid CboRoaringBitmap. +#[allow(clippy::ptr_arg)] // required to avoid signature mismatch fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Result<&'a [u8]> { Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) } From 620fee35f98db38715ad9be6ad54c15b13d692a3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 6 Nov 2023 11:56:46 +0100 Subject: [PATCH 105/127] Fix benches --- benchmarks/benches/indexing.rs | 89 +++++++++++++--------------------- 1 file changed, 34 insertions(+), 55 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index cb220a5f0..c31bfab89 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -264,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); @@ -611,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) 
}, ) }); @@ -873,22 +853,41 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); } +fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec) { + let mut wtxn = index.write_txn().unwrap(); + + let indexer_config = IndexerConfig::default(); + for ids in document_ids_to_delete { + let external_documents_ids = index.external_documents_ids(); + // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings). + // Since what we have is an iterator, it would be better to delete in chunks + let external_to_internal: std::result::Result, RoaringBitmap> = + external_documents_ids + .find_external_id_of(&wtxn, ids) + .unwrap() + .only_external_ids() + .collect(); + let ids = external_to_internal.unwrap(); + let config = IndexDocumentsConfig::default(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false) + .unwrap(); + (builder, _) = builder.remove_documents(ids).unwrap(); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); +} + fn indexing_movies_in_three_batches(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); @@ -1110,17 +1109,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, 
&index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); @@ -1336,17 +1325,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { (index, document_ids_to_delete) }, move |(index, document_ids_to_delete)| { - let mut wtxn = index.write_txn().unwrap(); - - for ids in document_ids_to_delete { - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_documents(&ids); - builder.execute().unwrap(); - } - - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); + delete_documents_from_ids(index, document_ids_to_delete) }, ) }); From ef6fa10f7a93e8aedd36feb3dce327bd6f896636 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 6 Nov 2023 12:16:15 +0100 Subject: [PATCH 106/127] Remove `IndexOperation::DocumentDeletion` --- index-scheduler/src/batch.rs | 65 +++++++----------------------------- 1 file changed, 12 insertions(+), 53 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index d96891d82..c9deedb37 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -104,12 +104,6 @@ pub(crate) enum IndexOperation { operations: Vec, tasks: Vec, }, - DocumentDeletion { - index_uid: String, - // The vec associated with each document deletion tasks. - documents: Vec>, - tasks: Vec, - }, IndexDocumentDeletionByFilter { index_uid: String, task: Task, @@ -161,7 +155,6 @@ impl Batch { } Batch::IndexOperation { op, .. } => match op { IndexOperation::DocumentOperation { tasks, .. } - | IndexOperation::DocumentDeletion { tasks, .. } | IndexOperation::Settings { tasks, .. } | IndexOperation::DocumentClear { tasks, .. } => { tasks.iter().map(|task| task.uid).collect() @@ -226,7 +219,6 @@ impl IndexOperation { pub fn index_uid(&self) -> &str { match self { IndexOperation::DocumentOperation { index_uid, .. 
} - | IndexOperation::DocumentDeletion { index_uid, .. } | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. } | IndexOperation::DocumentClear { index_uid, .. } | IndexOperation::Settings { index_uid, .. } @@ -242,9 +234,6 @@ impl fmt::Display for IndexOperation { IndexOperation::DocumentOperation { .. } => { f.write_str("IndexOperation::DocumentOperation") } - IndexOperation::DocumentDeletion { .. } => { - f.write_str("IndexOperation::DocumentDeletion") - } IndexOperation::IndexDocumentDeletionByFilter { .. } => { f.write_str("IndexOperation::IndexDocumentDeletionByFilter") } @@ -347,18 +336,27 @@ impl IndexScheduler { BatchKind::DocumentDeletion { deletion_ids } => { let tasks = self.get_existing_tasks(rtxn, deletion_ids)?; - let mut documents = Vec::new(); + let mut operations = Vec::with_capacity(tasks.len()); + let mut documents_counts = Vec::with_capacity(tasks.len()); for task in &tasks { match task.kind { KindWithContent::DocumentDeletion { ref documents_ids, .. } => { - documents.push(documents_ids.clone()) + operations.push(DocumentOperation::Delete(documents_ids.clone())); + documents_counts.push(documents_ids.len() as u64); } _ => unreachable!(), } } Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentDeletion { index_uid, documents, tasks }, + op: IndexOperation::DocumentOperation { + index_uid, + primary_key: None, + method: IndexDocumentsMethod::ReplaceDocuments, + documents_counts, + operations, + tasks, + }, must_create_index, })) } @@ -1275,45 +1273,6 @@ impl IndexScheduler { Ok(tasks) } - IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => { - let indexer_config = self.index_mapper.indexer_config(); - let config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let must_stop_processing = self.must_stop_processing.clone(); - - let mut builder = milli::update::IndexDocuments::new( - index_wtxn, - index, - indexer_config, - config, - 
|indexing_step| trace!("update: {:?}", indexing_step), - || must_stop_processing.get(), - )?; - - let document_ids = documents.iter().flatten().cloned().collect(); - - let (new_builder, user_result) = builder.remove_documents(document_ids)?; - builder = new_builder; - // Uses Invariant: remove documents actually always returns Ok for the inner result - let count = user_result.unwrap(); - - for (task, documents) in tasks.iter_mut().zip(documents) { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentDeletion { - provided_ids: documents.len(), - deleted_documents: Some(count.min(documents.len() as u64)), - }); - } - - if !tasks.iter().all(|res| res.error.is_some()) { - let addition = builder.execute()?; - info!("document deletion done: {:?}", addition); - } - - Ok(tasks) - } IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => { let filter = if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } = From 1e2fbc6a421e5dbea38e40c04d4511f28a5b7ea0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 7 Nov 2023 16:46:52 +0100 Subject: [PATCH 107/127] revert "REVERT ME: ignore prefix pair databases tests" This reverts commit 1b2ea6cf19309782a2e3b2ff2fe6d7708dd5de4f. 
--- milli/src/update/prefix_word_pairs/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 320c01461..e718f9b77 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -171,7 +171,6 @@ mod tests { documents } - #[ignore] #[test] fn add_new_documents() { let mut index = TempIndex::new(); @@ -236,7 +235,6 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); db_snap!(index, prefix_word_pair_proximity_docids, "update"); } - #[ignore] #[test] fn batch_bug_3043() { // https://github.com/meilisearch/meilisearch/issues/3043 @@ -285,7 +283,6 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids); } - #[ignore] #[test] fn hard_delete_and_reupdate() { let mut index = TempIndex::new(); @@ -360,7 +357,6 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); } - #[ignore] #[test] fn replace_hard_deletion() { let mut index = TempIndex::new(); From 6dab826908b815c5e62e63f8848f22cf7196e5de Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Nov 2023 11:52:08 +0100 Subject: [PATCH 108/127] Reactivate prefix databases --- milli/src/update/index_documents/mod.rs | 56 ++++++++++++++++++++----- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2be410ace..f5fbe2797 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -381,12 +381,48 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + let mut word_pair_proximity_docids = None; + let mut word_position_docids = None; + let mut word_fid_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + for result in lmdb_writer_rx { if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } - let typed_chunk = result?; + 
let typed_chunk = match result? { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; + word_docids = Some(cloneable_chunk); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + exact_word_docids = Some(cloneable_chunk); + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } + } + TypedChunk::WordPairProximityDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_pair_proximity_docids = Some(cloneable_chunk); + TypedChunk::WordPairProximityDocids(chunk) + } + TypedChunk::WordPositionDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_position_docids = Some(cloneable_chunk); + TypedChunk::WordPositionDocids(chunk) + } + otherwise => otherwise, + }; // FIXME: return newly added as well as newly deleted documents let (docids, is_merged_database) = @@ -417,17 +453,17 @@ where // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; + let number_of_documents = self.index.number_of_documents(self.wtxn)?; - // TODO: reactivate prefix DB with diff-indexing - // self.execute_prefix_databases( - // word_docids, - // exact_word_docids, - // word_pair_proximity_docids, - // word_position_docids, - // word_fid_docids, - // )?; + self.execute_prefix_databases( + word_docids, + exact_word_docids, + word_pair_proximity_docids, + word_position_docids, + word_fid_docids, + )?; - self.index.number_of_documents(self.wtxn) + Ok(number_of_documents) } #[logging_timer::time("IndexDocuments::{}")] From 688266c83e59d4c311b070fe8d274ac071cafae3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Nov 2023 14:16:01 
+0100 Subject: [PATCH 109/127] Remove word pair proximity prefix cache and compute it at search time --- milli/src/index.rs | 14 +- milli/src/search/new/db_cache.rs | 62 +- milli/src/update/clear_documents.rs | 5 - milli/src/update/index_documents/mod.rs | 39 +- milli/src/update/mod.rs | 5 - milli/src/update/prefix_word_pairs/mod.rs | 418 ---------- .../update/prefix_word_pairs/prefix_word.rs | 182 ----- .../update/prefix_word_pairs/word_prefix.rs | 728 ------------------ 8 files changed, 41 insertions(+), 1412 deletions(-) delete mode 100644 milli/src/update/prefix_word_pairs/mod.rs delete mode 100644 milli/src/update/prefix_word_pairs/prefix_word.rs delete mode 100644 milli/src/update/prefix_word_pairs/word_prefix.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 86ef6105b..5023d8fa5 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -83,8 +83,6 @@ pub mod db_name { pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; - pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; - pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; @@ -129,10 +127,6 @@ pub struct Index { /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, - /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. - pub word_prefix_pair_proximity_docids: Database, - /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. 
- pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. pub word_position_docids: Database, @@ -186,7 +180,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(26); + options.max_dbs(24); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -203,10 +197,6 @@ impl Index { env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let script_language_docids = env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; - let word_prefix_pair_proximity_docids = - env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; - let prefix_word_pair_proximity_docids = - env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; let field_id_word_count_docids = @@ -247,8 +237,6 @@ impl Index { exact_word_prefix_docids, word_pair_proximity_docids, script_language_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, word_prefix_position_docids, diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 3376cebb2..2c670658d 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -11,7 +11,9 @@ use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; -use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext}; +use crate::{ + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, +}; /// A cache storing pointers to values in the LMDB databases. 
/// @@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option>>, + FxHashMap<(u8, Interned, Interned), Option>, pub prefix_word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_docids: FxHashMap, Option>>, @@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> { prefix2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, word1, prefix2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(prefix2).as_str(), - ), - &mut self.db_cache.word_prefix_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - ) + let docids = match self + .db_cache + .word_prefix_pair_proximity_docids + .entry((proximity, word1, prefix2)) + { + Entry::Occupied(docids) => docids.get().clone(), + Entry::Vacant(entry) => { + // compute docids using prefix iter and store the result in the cache. 
+ let key = U8StrStrCodec::bytes_encode(&( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(prefix2).as_str(), + )) + .unwrap() + .into_owned(); + let mut prefix_docids = RoaringBitmap::new(); + let remap_key_type = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .prefix_iter(self.txn, &key)?; + for result in remap_key_type { + let (_, docids) = result?; + + prefix_docids |= docids; + } + entry.insert(Some(prefix_docids.clone())); + Some(prefix_docids) + } + }; + Ok(docids) } + pub fn get_db_prefix_word_pair_proximity_docids( &mut self, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, left_prefix, right), - &( - proximity, - self.word_interner.get(left_prefix).as_str(), - self.word_interner.get(right).as_str(), - ), - &mut self.db_cache.prefix_word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - ) + // only accept exact matches on reverted positions + self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) } pub fn get_db_word_fid_docids( diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 265c6f15a..afe0191b1 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids, exact_word_prefix_docids, word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, field_id_word_count_docids, @@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; - word_prefix_pair_proximity_docids.clear(self.wtxn)?; - prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; 
word_fid_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; @@ -132,7 +128,6 @@ mod tests { assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); - assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f5fbe2797..8552cf52b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,13 +35,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, - WordPrefixIntegerDocids, WordsPrefixesFst, + IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{CboRoaringBitmapCodec, Index, Result}; static MERGED_DATABASE_COUNT: usize = 7; -static PREFIX_DATABASE_COUNT: usize = 5; +static PREFIX_DATABASE_COUNT: usize = 4; static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -381,7 +380,6 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_fid_docids = None; let mut word_docids = None; @@ -411,11 +409,6 @@ where word_fid_docids_reader, } } - TypedChunk::WordPairProximityDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; - word_pair_proximity_docids = Some(cloneable_chunk); - TypedChunk::WordPairProximityDocids(chunk) - } TypedChunk::WordPositionDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; word_position_docids = Some(cloneable_chunk); @@ -458,7 +451,6 @@ where self.execute_prefix_databases( word_docids, exact_word_docids, - word_pair_proximity_docids, word_position_docids, word_fid_docids, )?; @@ -471,7 +463,6 @@ where self, word_docids: Option>, exact_word_docids: Option>, - word_pair_proximity_docids: Option>, word_position_docids: Option>, word_fid_docids: Option>, ) -> Result<()> @@ -592,32 +583,6 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { - // Run the word prefix pair proximity docids update operation. - PrefixWordPairsProximityDocids::new( - self.wtxn, - self.index, - self.indexer_config.chunk_compression_type, - self.indexer_config.chunk_compression_level, - ) - .execute( - word_pair_proximity_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; - } - - if (self.should_abort)() { - return Err(Error::InternalError(InternalError::AbortedIndexation)); - } - - databases_seen += 1; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - if let Some(word_position_docids) = word_position_docids { // Run the words prefix position docids update operation. 
let mut builder = WordPrefixIntegerDocids::new( diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index dd8851ccb..eb2b6e69a 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -8,10 +8,6 @@ pub use self::index_documents::{ MergeFn, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::{ - PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, -}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; @@ -24,7 +20,6 @@ pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; mod indexer_config; -mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs deleted file mode 100644 index e718f9b77..000000000 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ /dev/null @@ -1,418 +0,0 @@ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::{BufReader, BufWriter}; - -use grenad::CompressionType; -use heed::types::ByteSlice; - -use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; -use crate::{Index, Result}; - -mod prefix_word; -mod word_prefix; - -pub use prefix_word::index_prefix_word_database; -pub use word_prefix::index_word_prefix_database; - -pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; -pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; - -pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - max_proximity: u8, - max_prefix_length: usize, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -} -impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - chunk_compression_type: CompressionType, - chunk_compression_level: 
Option, - ) -> Self { - Self { - wtxn, - index, - max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - chunk_compression_type, - chunk_compression_level, - } - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - puffin::profile_function!(); - - index_word_prefix_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids.clone(), - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - index_prefix_word_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids, - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - Ok(()) - } -} - -// This is adapted from `sorter_into_lmdb_database` -pub fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? 
- crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer>, -) -> Result<()> { - let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? 
{ - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::io::Cursor; - - use crate::db_snap; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::update::IndexDocumentsMethod; - - fn documents_with_enough_different_words_for_prefixes( - prefixes: &[&str], - start_id: usize, - ) -> Vec { - let mut documents = Vec::new(); - let mut id = start_id; - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "id": id, - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ); - id += 1; - } - } - documents - } - - #[test] - fn add_new_documents() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": "9000", - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": "9001", - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, 
prefix_word_pair_proximity_docids, "initial"); - - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); - documents.push( - serde_json::json!({ - "id": "9002", - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids, "update"); - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - db_snap!(index, prefix_word_pair_proximity_docids, "update"); - } - #[test] - fn batch_bug_3043() { - // https://github.com/meilisearch/meilisearch/issues/3043 - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "text": "x y" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "x a y" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids); - db_snap!(index, word_prefix_pair_proximity_docids); - db_snap!(index, prefix_word_pair_proximity_docids); - } - - #[test] - fn hard_delete_and_reupdate() { - let mut index = TempIndex::new(); - 
index.index_documents_config.words_prefix_threshold = Some(50); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - index.delete_document("9000"); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - index.delete_documents((0..50).map(|id| id.to_string()).collect()); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); - 
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[test] - fn replace_hard_deletion() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); - 
index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - } -} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs deleted file mode 100644 index 1ec66d010..000000000 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ /dev/null @@ -1,182 +0,0 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_prefix_word_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database, - prefix_word_pair_proximity_docids: heed::Database, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -) -> Result<()> { - puffin::profile_function!(); - - let max_proximity = max_proximity - 1; - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - let common_prefixes: Vec<_> = common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - for proximity in 1..max_proximity { - for prefix in common_prefixes.iter() { - let mut prefix_key = vec![proximity]; - 
prefix_key.extend_from_slice(prefix.as_bytes()); - let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - // the next two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.next()? { - let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some((word2, value))) - } else { - Ok(None) - } - }, - // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - let new_prefixes: Vec<_> = new_prefix_fst_words - .iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - for proximity in 1..max_proximity { - for prefix in new_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut db_iter = word_pair_proximity_docids - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? 
- .remap_key_type::(); - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - &mut db_iter, - |db_iter| { - db_iter - .next() - .transpose() - .map(|x| x.map(|((_, _, word2), value)| (word2, value))) - .map_err(|e| e.into()) - }, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - } - } - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - writer, - )?; - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = - prefix_word_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; - while let Some(((_, prefix, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. 
-/// -/// Its arguments are: -/// - an iterator over the words following the given `prefix` with the given `proximity` -/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements -fn execute_on_word_pairs_and_prefixes( - proximity: u8, - prefix: &[u8], - iter: &mut I, - mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch: BTreeMap, Vec>> = BTreeMap::default(); - - // Memory usage check: - // The content of the loop will be called for each `word2` that follows a word beginning - // with `prefix` with the given proximity. - // In practice, I don't think the batch can ever get too big. - while let Some((word2, docids)) = next_word2_and_docids(iter)? { - let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(docids.to_owned())); - } - - let mut key_buffer = Vec::with_capacity(512); - key_buffer.push(proximity); - key_buffer.extend_from_slice(prefix); - key_buffer.push(0); - - let mut value_buffer = Vec::with_capacity(65_536); - - for (word2, docids) in batch { - key_buffer.truncate(prefix.len() + 2); - value_buffer.clear(); - - key_buffer.extend_from_slice(&word2); - let data = if docids.len() > 1 { - CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; - value_buffer.as_slice() - } else { - &docids[0] - }; - insert(key_buffer.as_slice(), data)?; - } - Ok(()) -} diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs deleted file mode 100644 index 570adece9..000000000 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ /dev/null @@ -1,728 +0,0 @@ -/*! 
-The word-prefix-pair-proximity-docids database is a database whose keys are of -the form `(proximity, word, prefix)` and the values are roaring bitmaps of -the documents which contain `word` followed by another word starting with -`prefix` at a distance of `proximity`. - -The prefixes present in this database are only those that correspond to many -different words in the documents. - -## How is it created/updated? (simplified version) -To compute it, we have access to (mainly) two inputs: - -* a list of sorted prefixes, such as: -```text -c -ca -cat -d -do -dog -``` -Note that only prefixes which correspond to more than a certain number of -different words from the database are included in this list. - -* a sorted list of proximities and word pairs (the proximity is the distance between the two words), -associated with a roaring bitmap, such as: -```text -1 good doggo -> docids1: [8] -1 good door -> docids2: [7, 19, 20] -1 good ghost -> docids3: [1] -2 good dog -> docids4: [2, 5, 6] -2 horror cathedral -> docids5: [1, 2] -``` - -I illustrate a simplified version of the algorithm to create the word-prefix -pair-proximity database below: - -1. **Outer loop:** First, we iterate over each proximity and word pair: -```text -proximity: 1 -word1 : good -word2 : doggo -``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are -in the list of sorted prefixes. And we insert the key `prefix` -and the value (`docids`) to a sorted map which we call the “batch”. For example, -at the end of the first outer loop, we may have: -```text -Outer loop 1: ------------------------------- -proximity: 1 -word1 : good -word2 : doggo -docids : docids1 - -prefixes: [d, do, dog] - -batch: [ - d, -> [docids1] - do -> [docids1] - dog -> [docids1] -] -``` -3. 
For illustration purpose, let's run through a second iteration of the outer loop: -```text -Outer loop 2: ------------------------------- -proximity: 1 -word1 : good -word2 : door -docids : docids2 - -prefixes: [d, do, doo] - -batch: [ - d -> [docids1, docids2] - do -> [docids1, docids2] - dog -> [docids1] - doo -> [docids2] -] -``` -Notice that there were some conflicts which were resolved by merging the -conflicting values together. Also, an additional prefix was added at the -end of the batch. - -4. On the third iteration of the outer loop, we have: -```text -Outer loop 3: ------------------------------- -proximity: 1 -word1 : good -word2 : ghost -``` -Because `word2` begins with a different letter than the previous `word2`, -we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 - -Therefore, we know that we can insert every element from the batch into the -database before proceeding any further. This operation is called -“flushing the batch”. Flushing the batch should also be done whenever: -* `proximity` is different than the previous `proximity`. -* `word1` is different than the previous `word1`. -* `word2` starts with a different letter than the previous word2 - -6. **Flushing the batch:** to flush the batch, we iterate over its elements: -```text -Flushing Batch loop 1: ------------------------------- -proximity : 1 -word1 : good -prefix : d - -docids : [docids2, docids3] -``` -We then merge the array of `docids` (of type `Vec>`) using -`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a -roaring bitmap of all the document ids where `word1` is followed by `prefix` -at a distance of `proximity`. -Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` -into the database. - -7. That's it! ... except... - -## How is it created/updated (continued) - -I lied a little bit about the input data. 
In reality, we get two sets of the -inputs described above, which come from different places: - -* For the list of sorted prefixes, we have: - 1. `new_prefixes`, which are all the prefixes that were not present in the - database before the insertion of the new documents - - 2. `common_prefixes` which are the prefixes that are present both in the - database and in the newly added documents - -* For the list of word pairs and proximities, we have: - 1. `new_word_pairs`, which is the list of word pairs and their proximities - present in the newly added documents - - 2. `word_pairs_db`, which is the list of word pairs from the database. - This list includes all elements in `new_word_pairs` since `new_word_pairs` - was added to the database prior to calling the `WordPrefix::execute` - function. - -To update the prefix database correctly, we call the algorithm described earlier first -on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). -Thus: - -1. For all the word pairs that were already present in the DB, we insert them -again with the `new_prefixes`. Calling the algorithm on them with the -`common_prefixes` would not result in any new data. - -2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, -and then, because they are part of `word_pairs_db`, with the `new_prefixes`. - -Note, also, that since we read data from the database when iterating over -`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- -docids from the batch directly into the database (we would have a concurrent -reader and writer). Therefore, when calling the algorithm on -`(new_prefixes, word_pairs_db)`, we insert the computed -`((proximity, word, prefix), docids)` elements in an intermediary grenad -Writer instead of the DB. At the end of the outer loop, we finally read from -the grenad and insert its elements in the database. 
-*/ - -use std::borrow::Cow; -use std::collections::HashSet; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_word_prefix_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database, - word_prefix_pair_proximity_docids: heed::Database, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -) -> Result<()> { - puffin::profile_function!(); - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (proximity, word1, common_prefix) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? 
{ - let (proximity, word1, word2) = - UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - Ok(Some(((proximity, word1, word2), value))) - } else { - Ok(None) - } - }, - &prefixes, - max_proximity, - // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. 
- if !del_prefix_fst_words.is_empty() { - let mut iter = - word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; - while let Some(((_, _, prefix), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. -/// -/// Its main arguments are: -/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements -/// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements -/// -/// For more information about what this function does, read the module documentation. -fn execute_on_word_pairs_and_prefixes( - iter: &mut I, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut I, - ) -> Result< - Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - // Optimisation: the index at the root of the prefix trie where to search for - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - - // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter - let mut empty_prefixes = false; - - let mut prefix_buffer = Vec::with_capacity(8); - let mut merge_buffer = Vec::with_capacity(65_536); - - while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ - // stop indexing if the proximity is over the threshold - if proximity > max_proximity { - break; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the same letter, then there is also no potential - // prefixes for the current word2, and we can skip to the next iteration - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - - // if the proximity is different to the previous one, OR - // if word1 is different than the previous word1, OR - // if the start of word2 is different than the previous start of word2, - // THEN we'll need to flush the batch - let prox_different_than_prev = proximity != batch.proximity; - let word1_different_than_prev = word1 != batch.word1; - if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev - { - batch.flush(&mut merge_buffer, &mut insert)?; - batch.proximity = proximity; - // don't forget to reset the value of batch.word1 and prev_word2_start - if word1_different_than_prev { - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - prev_word2_start = word2[0]; - } - prefix_search_start.0 = 0; - // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - // All conditions are satisfied, we can now insert each new prefix of word2 into the batch - prefix_buffer.clear(); - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - batch.insert(prefix_buffer, data.to_vec()); - }, - ); - } - } - batch.flush(&mut merge_buffer, &mut insert)?; - Ok(()) -} -/** -A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). 
-The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. - -It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. - -The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content -can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: -- key : (proximity, word1, prefix) as bytes -- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes -*/ -#[derive(Default)] -struct PrefixAndProximityBatch { - proximity: u8, - word1: Vec, - #[allow(clippy::type_complexity)] - batch: Vec<(Vec, Vec>)>, -} - -impl PrefixAndProximityBatch { - /// Insert the new key and value into the batch - /// - /// The key must either exist in the batch or be greater than all existing keys - fn insert(&mut self, new_key: &[u8], new_value: Vec) { - match self.batch.iter_mut().find(|el| el.0 == new_key) { - Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), - None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), - } - } - - /// Empties the batch, calling `insert` on each element. - /// - /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. - fn flush( - &mut self, - merge_buffer: &mut Vec, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let PrefixAndProximityBatch { proximity, word1, batch } = self; - if batch.is_empty() { - return Ok(()); - } - merge_buffer.clear(); - - let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); - buffer.push(*proximity); - buffer.extend_from_slice(word1); - buffer.push(0); - - for (key, mergeable_data) in batch.drain(..) 
{ - buffer.truncate(1 + word1.len() + 1); - buffer.extend_from_slice(key.as_slice()); - - let data = if mergeable_data.len() > 1 { - CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; - merge_buffer.as_slice() - } else { - &mergeable_data[0] - }; - insert(buffer.as_slice(), data)?; - merge_buffer.clear(); - } - - Ok(()) - } -} - -/** A prefix trie. Used to iterate quickly over the prefixes of a word that are -within a set. - -## Structure -The trie is made of nodes composed of: -1. a byte character (e.g. 'a') -2. whether the node is an end node or not -3. a list of children nodes, sorted by their byte character - -For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]` -is drawn below. Nodes with a double border are "end nodes". - -┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗ -│ a │ │ c │ ║ r ║ -└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝ -╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗ ┌─────────┐ ╔══════════╗ -║ c ║║ e ║║ r ║ │ e │ ║ h ║ │ e │ ║ i ║ -╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝ └─────────┘ ╚══════════╝ - ╔═══╗ ╔═══╗ ╔═══╗ - ║ i ║ ║ l ║ ║ l ║ - ╚═══╝ ╚═══╝ ╚═══╝ -*/ -#[derive(Default, Debug)] -struct PrefixTrieNode { - children: Vec<(PrefixTrieNode, u8)>, - is_end_node: bool, -} - -#[derive(Debug)] -struct PrefixTrieNodeSearchStart(usize); - -impl PrefixTrieNode { - fn is_empty(&self) -> bool { - self.children.is_empty() - } - - /// Returns false if the trie does not contain a prefix of the given word. - /// Returns true if the trie *may* contain a prefix of the given word. - /// - /// Moves the search start to the first node equal to the first letter of the word, - /// or to 0 otherwise. 
- fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { - let byte = word[0]; - if self.children[search_start.0].1 == byte { - true - } else { - match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { - Ok(position) => { - search_start.0 += position; - true - } - Err(_) => { - search_start.0 = 0; - false - } - } - } - } - - fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { - let mut node = PrefixTrieNode::default(); - for prefix in prefixes { - node.insert_sorted_prefix(prefix.as_bytes().iter()); - } - node - } - fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { - if let Some(&c) = prefix.next() { - if let Some((node, byte)) = self.children.last_mut() { - if *byte == c { - node.insert_sorted_prefix(prefix); - return; - } - } - let mut new_node = PrefixTrieNode::default(); - new_node.insert_sorted_prefix(prefix); - self.children.push((new_node, c)); - } else { - self.is_end_node = true; - } - } - - /// Call the given closure on each prefix of the word contained in the prefix trie. - /// - /// The search starts from the given `search_start`. - fn for_each_prefix_of( - &self, - word: &[u8], - buffer: &mut Vec, - search_start: &PrefixTrieNodeSearchStart, - mut do_fn: impl FnMut(&mut Vec), - ) { - let first_byte = word[0]; - let mut cur_node = self; - buffer.push(first_byte); - if let Some((child_node, c)) = - cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) - { - if *c == first_byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - for &byte in &word[1..] 
{ - buffer.push(byte); - if let Some((child_node, c)) = - cur_node.children.iter().find(|(_, c)| *c >= byte) - { - if *c == byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - } else { - break; - } - } else { - break; - } - } - } - } - } -} -#[cfg(test)] -mod tests { - use roaring::RoaringBitmap; - - use super::*; - use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; - - fn check_prefixes( - trie: &PrefixTrieNode, - search_start: &PrefixTrieNodeSearchStart, - word: &str, - expected_prefixes: &[&str], - ) { - let mut actual_prefixes = vec![]; - trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| { - let s = String::from_utf8(x.to_owned()).unwrap(); - actual_prefixes.push(s); - }); - assert_eq!(actual_prefixes, expected_prefixes); - } - - #[test] - fn test_trie() { - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", - "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", - "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", - "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", - "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", - "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", - "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", - "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", - "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", - "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", - "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", - "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", - "ri", "ro", "ru", "s", "sa", 
"san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", - "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", - "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", - "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", - ])); - - let mut search_start = PrefixTrieNodeSearchStart(0); - - let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(search_start.0, 2); - - check_prefixes(&trie, &search_start, "affair", &["a"]); - check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); - - let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(trie.children[search_start.0].1, b'u'); - - check_prefixes(&trie, &search_start, "unique", &["u", "un"]); - - // NOTE: this should fail, because the search start is already beyong 'a' - let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); - assert!(!is_empty); - // search start is reset - assert_eq!(search_start.0, 0); - - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); - check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); - } - - #[test] - fn test_execute_on_word_pairs_and_prefixes() { - let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - - let mut serialised_bitmap123 = vec![]; - let mut bitmap123 = RoaringBitmap::new(); - bitmap123.insert(1); - bitmap123.insert(2); - bitmap123.insert(3); - CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); - - let mut serialised_bitmap456 = vec![]; - let mut bitmap456 = RoaringBitmap::new(); - bitmap456.insert(4); - bitmap456.insert(5); - bitmap456.insert(6); - CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut 
serialised_bitmap456); - - let mut serialised_bitmap789 = vec![]; - let mut bitmap789 = RoaringBitmap::new(); - bitmap789.insert(7); - bitmap789.insert(8); - bitmap789.insert(9); - CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); - - let mut serialised_bitmap_ranges = vec![]; - let mut bitmap_ranges = RoaringBitmap::new(); - bitmap_ranges.insert_range(63_000..65_000); - bitmap_ranges.insert_range(123_000..128_000); - CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); - - let word_pairs = [ - ((1, "healthy", "arbres"), &serialised_bitmap123), - ((1, "healthy", "boat"), &serialised_bitmap123), - ((1, "healthy", "ca"), &serialised_bitmap123), - ((1, "healthy", "cats"), &serialised_bitmap456), - ((1, "healthy", "cattos"), &serialised_bitmap123), - ((1, "jittery", "cat"), &serialised_bitmap123), - ((1, "jittery", "cata"), &serialised_bitmap456), - ((1, "jittery", "catb"), &serialised_bitmap789), - ((1, "jittery", "catc"), &serialised_bitmap_ranges), - ((2, "healthy", "arbre"), &serialised_bitmap123), - ((2, "healthy", "arbres"), &serialised_bitmap456), - ((2, "healthy", "cats"), &serialised_bitmap789), - ((2, "healthy", "cattos"), &serialised_bitmap_ranges), - ((3, "healthy", "arbre"), &serialised_bitmap456), - ((3, "healthy", "arbres"), &serialised_bitmap789), - ]; - - let expected_result = [ - ((1, "healthy", "arb"), bitmap123.clone()), - ((1, "healthy", "arbre"), bitmap123.clone()), - ((1, "healthy", "cat"), &bitmap456 | &bitmap123), - ((1, "healthy", "catto"), bitmap123.clone()), - ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - ((2, "healthy", "arb"), &bitmap123 | &bitmap456), - ((2, "healthy", "arbre"), &bitmap123 | &bitmap456), - ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges), - ((2, "healthy", "catto"), bitmap_ranges.clone()), - ]; - - let mut result = vec![]; - - let mut iter = - IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| { 
- ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice()) - }); - execute_on_word_pairs_and_prefixes( - &mut iter, - |iter| Ok(iter.next()), - &prefixes, - 2, - |k, v| { - let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); - let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); - Ok(()) - }, - ) - .unwrap(); - - for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; - let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; - - assert_eq!(actual_word1, expected_word1); - assert_eq!(actual_prefix, expected_prefix); - assert_eq!(actual_proximity, expected_proximity); - assert_eq!(actual_bitmap, expected_bitmap); - } - } -} From 70ce40828c3d58a667a314a063d1c7c5b5a05645 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Nov 2023 16:41:26 +0100 Subject: [PATCH 110/127] Compute word docids prefix cache --- milli/src/update/del_add.rs | 14 +++++ .../index_documents/helpers/grenad_helpers.rs | 44 +++++++++++++++ .../helpers/merge_functions.rs | 16 ++++++ .../src/update/index_documents/helpers/mod.rs | 7 ++- milli/src/update/index_documents/mod.rs | 6 +- .../src/update/index_documents/typed_chunk.rs | 55 +++++-------------- milli/src/update/word_prefix_docids.rs | 27 ++++++--- 7 files changed, 116 insertions(+), 53 deletions(-) diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index c8b7f0f6a..dc7c0409a 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -102,3 +102,17 @@ pub fn del_add_from_two_obkvs( pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) } + +/// A function that extracts and returns the Add side of a DelAdd obkv. 
+/// This is useful when there are no previous value in the database and +/// therefore we don't need to do a diff with what's already there. +/// +/// If there is no Add side we currently write an empty buffer +/// which is a valid CboRoaringBitmap. +#[allow(clippy::ptr_arg)] // required to avoid signature mismatch +pub fn deladd_serialize_add_side<'a>( + obkv: &'a [u8], + _buffer: &mut Vec, +) -> crate::Result<&'a [u8]> { + Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) +} diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 4f764ab95..f520ea7b0 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -9,6 +9,7 @@ use log::debug; use super::{ClonableMmap, MergeFn}; use crate::error::InternalError; +use crate::update::index_documents::valid_lmdb_key; use crate::Result; pub type CursorClonableMmap = io::Cursor; @@ -282,6 +283,49 @@ pub fn sorter_into_lmdb_database( Ok(()) } +/// Write provided sorter in database using serialize_value function. +/// merge_values function is used if an entry already exist in the database. +pub fn write_sorter_into_database( + sorter: Sorter, + database: &heed::Database, + wtxn: &mut heed::RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, +{ + puffin::profile_function!(); + + let mut buffer = Vec::new(); + let database = database.remap_types::(); + + let mut merger_iter = sorter.into_stream_merger_iter()?; + while let Some((key, value)) = merger_iter.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = if index_is_empty { + Some(serialize_value(value, &mut buffer)?) + } else { + match database.get(wtxn, key)? 
{ + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), + } + }; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; + } + } + } + } + + Ok(()) +} + /// Used when trying to merge readers, but you don't actually care about the values. pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { Ok(Cow::Owned(Vec::new())) diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 98c1c1a04..5d9ca7ef2 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -239,3 +239,19 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>( output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) } } + +/// A function that merges a DelAdd of bitmao into an already existing bitmap. +/// +/// The first argument is the DelAdd obkv of CboRoaringBitmaps and +/// the second one is the CboRoaringBitmap to merge into. +pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( + deladd_obkv: &[u8], + previous: &[u8], + buffer: &'a mut Vec, +) -> Result> { + Ok(CboRoaringBitmapCodec::merge_deladd_into( + KvReaderDelAdd::new(deladd_obkv), + previous, + buffer, + )?) 
+} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 1f2f8e6ef..c167f1cd3 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,12 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, - GrenadParameters, MergeableReader, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_sorter_into_database, + writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, - merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 8552cf52b..5dbb4dd0b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -26,8 +26,10 @@ pub use self::enrich::{ }; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, - fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, + fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, 
write_sorter_into_database, + writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b53d859cd..90f9b7739 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,10 @@ use obkv::{KvReader, KvWriter}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap}; +use super::helpers::{ + self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, + valid_lmdb_key, CursorClonableMmap, +}; use super::{ClonableMmap, MergeFn}; use crate::distance::NDotProductPoint; use crate::error::UserError; @@ -21,12 +24,11 @@ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; use crate::index::Hnsw; -use crate::update::del_add::{DelAdd, KvReaderDelAdd}; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; use crate::{ - lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result, - SerializationError, BEU32, + lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32, }; pub(crate) enum TypedChunk { @@ -186,7 +188,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -202,7 +204,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, 
+ merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -212,7 +214,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; @@ -222,7 +224,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; // create fst from word docids @@ -244,7 +246,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -265,7 +267,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -276,7 +278,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -287,7 +289,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -298,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -495,33 +497,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -/// A 
function that extracts and returns the Add side of a DelAdd obkv. -/// This is useful when there are no previous value in the database and -/// therefore we don't need to do a diff with what's already there. -/// -/// If there is no Add side we currently write an empty buffer -/// which is a valid CboRoaringBitmap. -#[allow(clippy::ptr_arg)] // required to avoid signature mismatch -fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Result<&'a [u8]> { - Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) -} - -/// A function that merges a DelAdd of bitmao into an already existing bitmap. -/// -/// The first argument is the DelAdd obkv of CboRoaringBitmaps and -/// the second one is the CboRoaringBitmap to merge into. -fn merge_deladd_cbo_roaring_bitmaps<'a>( - deladd_obkv: &[u8], - previous: &[u8], - buffer: &'a mut Vec, -) -> Result> { - Ok(CboRoaringBitmapCodec::merge_deladd_into( - KvReaderDelAdd::new(deladd_obkv), - previous, - buffer, - )?) -} - /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. 
fn write_entries_into_database( diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 8220aa777..618f451dc 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -4,9 +4,11 @@ use grenad::CompressionType; use heed::types::{ByteSlice, Str}; use heed::Database; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeFn, }; use crate::{CboRoaringBitmapCodec, Result}; @@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. let db = self.word_docids.remap_data_type::(); + let mut buffer = Vec::new(); for prefix in new_prefix_fst_words { let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? 
{ let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; + buffer.clear(); + let mut writer = KvWriterDelAdd::new(&mut buffer); + writer.insert(DelAdd::Addition, data)?; + + prefix_docids_sorter.insert(prefix, writer.into_inner()?)?; } } @@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { drop(iter); + let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?; + // We finally write the word prefix docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.word_prefix_docids.as_polymorph(), + write_sorter_into_database( prefix_docids_sorter, - merge_cbo_roaring_bitmaps, + &self.word_prefix_docids, + self.wtxn, + database_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; Ok(()) From 5a9c96e1db0b2ec1de77c0c01b76676072aec754 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 9 Nov 2023 11:34:26 +0100 Subject: [PATCH 111/127] Compute word integer prefix cache --- .../src/update/words_prefix_integer_docids.rs | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index c65438928..e083f510a 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -9,9 +9,11 @@ use log::debug; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeFn, }; use crate::{CboRoaringBitmapCodec, 
Result}; @@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { let mut prefix_integer_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. let db = self.word_database.remap_data_type::(); + let mut buffer = Vec::new(); for prefix_bytes in new_prefix_fst_words { let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } @@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { if word.starts_with(prefix) { let key = (prefix, pos); let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); - prefix_integer_docids_sorter.insert(bytes, data)?; + + buffer.clear(); + let mut writer = KvWriterDelAdd::new(&mut buffer); + writer.insert(DelAdd::Addition, data)?; + prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?; } } } @@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { drop(iter); } + let database_is_empty = self.prefix_database.is_empty(self.wtxn)?; + // We finally write all the word prefix integer docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.prefix_database.as_polymorph(), + write_sorter_into_database( prefix_integer_docids_sorter, - merge_cbo_roaring_bitmaps, + &self.prefix_database, + self.wtxn, + database_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; Ok(()) @@ -159,6 +170,7 @@ fn write_prefixes_in_sorter( prefixes: &mut HashMap, Vec>>, sorter: &mut grenad::Sorter, ) -> Result<()> { + // TODO: Merge before insertion. 
for (key, data_slices) in prefixes.drain() { for data in data_slices { if valid_lmdb_key(&key) { From 882ab9cc857fde9394b9fc4f1d46599617b8ccd7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 9 Nov 2023 11:35:33 +0100 Subject: [PATCH 112/127] remove warnings --- .../index_documents/helpers/grenad_helpers.rs | 45 ------------------- .../src/update/index_documents/helpers/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 4 +- 3 files changed, 4 insertions(+), 49 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index f520ea7b0..061cbe5a0 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -1,14 +1,11 @@ use std::borrow::Cow; use std::fs::File; use std::io::{self, BufReader, BufWriter, Seek}; -use std::time::Instant; use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; -use log::debug; use super::{ClonableMmap, MergeFn}; -use crate::error::InternalError; use crate::update::index_documents::valid_lmdb_key; use crate::Result; @@ -241,48 +238,6 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn sorter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - sorter: Sorter, - merge: MergeFn, -) -> Result<()> { - puffin::profile_function!(); - debug!("Writing MTBL sorter..."); - let before = Instant::now(); - - let mut merger_iter = sorter.into_stream_merger_iter()?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = merger_iter.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } else { - while let Some((k, v)) = merger_iter.next()? 
{ - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - } - - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); - Ok(()) -} - /// Write provided sorter in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. pub fn write_sorter_into_database( diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index c167f1cd3..841c09543 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,8 +9,8 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_sorter_into_database, - writer_into_reader, GrenadParameters, MergeableReader, + merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5dbb4dd0b..de0361936 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -28,8 +28,8 @@ pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, 
fst_stream_into_hashset, fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, write_sorter_into_database, - writer_into_reader, ClonableMmap, MergeFn, + merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; From db2fb86b8bbb69cb79781d74dda885460ea45560 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 14:19:16 +0100 Subject: [PATCH 113/127] Extract PrimaryKey logic to a type --- milli/src/documents/mod.rs | 10 ++ milli/src/documents/primary_key.rs | 168 +++++++++++++++++++++++++++++ milli/src/fields_ids_map.rs | 6 ++ 3 files changed, 184 insertions(+) create mode 100644 milli/src/documents/primary_key.rs diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7c037b3bf..4429f083d 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,5 +1,6 @@ mod builder; mod enriched; +mod primary_key; mod reader; mod serde_impl; @@ -11,6 +12,9 @@ use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; +pub use primary_key::{ + DocumentIdExtractionError, FieldDistribution, PrimaryKey, DEFAULT_PRIMARY_KEY, +}; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -87,6 +91,12 @@ impl DocumentsBatchIndex { } } +impl FieldDistribution for DocumentsBatchIndex { + fn id(&self, name: &str) -> Option { + self.id(name) + } +} + #[derive(Debug, thiserror::Error)] pub enum Error { #[error("Error parsing number {value:?} at line {line}: {error}")] diff --git 
a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs new file mode 100644 index 000000000..dd97f2608 --- /dev/null +++ b/milli/src/documents/primary_key.rs @@ -0,0 +1,168 @@ +use std::iter; +use std::result::Result as StdResult; + +use serde_json::Value; + +use crate::{FieldId, InternalError, Object, Result, UserError}; + +/// The symbol used to define levels in a nested primary key. +const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; + +/// The default primary that is used when not specified. +pub const DEFAULT_PRIMARY_KEY: &str = "id"; + +pub trait FieldDistribution { + fn id(&self, name: &str) -> Option; +} + +/// A type that represent the type of primary key that has been set +/// for this index, a classic flat one or a nested one. +#[derive(Debug, Clone, Copy)] +pub enum PrimaryKey<'a> { + Flat { name: &'a str, field_id: FieldId }, + Nested { name: &'a str }, +} + +pub enum DocumentIdExtractionError { + InvalidDocumentId(UserError), + MissingDocumentId, + TooManyDocumentIds(usize), +} + +impl<'a> PrimaryKey<'a> { + pub fn new(path: &'a str, fields: &impl FieldDistribution) -> Option { + Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) { + Self::Nested { name: path } + } else { + let field_id = fields.id(path)?; + Self::Flat { name: path, field_id } + }) + } + + pub fn name(&self) -> &str { + match self { + PrimaryKey::Flat { name, .. } => name, + PrimaryKey::Nested { name } => name, + } + } + + pub fn document_id( + &self, + document: &obkv::KvReader, + fields: &impl FieldDistribution, + ) -> Result> { + match self { + PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) { + Some(document_id_bytes) => { + let document_id = serde_json::from_slice(document_id_bytes) + .map_err(InternalError::SerdeJson)?; + match validate_document_id_value(document_id)? 
{ + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => { + Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) + } + } + } + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + }, + nested @ PrimaryKey::Nested { .. } => { + let mut matching_documents_ids = Vec::new(); + for (first_level_name, right) in nested.possible_level_names() { + if let Some(field_id) = fields.id(first_level_name) { + if let Some(value_bytes) = document.get(field_id) { + let object = serde_json::from_slice(value_bytes) + .map_err(InternalError::SerdeJson)?; + fetch_matching_values(object, right, &mut matching_documents_ids); + + if matching_documents_ids.len() >= 2 { + return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds( + matching_documents_ids.len(), + ))); + } + } + } + } + + match matching_documents_ids.pop() { + Some(document_id) => match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => { + Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) + } + }, + None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), + } + } + } + } + + /// Returns an `Iterator` that gives all the possible fields names the primary key + /// can have depending of the first level name and depth of the objects. 
+ pub fn possible_level_names(&self) -> impl Iterator + '_ { + let name = self.name(); + name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) + .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) + .chain(iter::once((name, ""))) + } +} + +fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { + match value { + Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), + otherwise => output.push(otherwise), + } +} + +fn fetch_matching_values_in_object( + object: Object, + selector: &str, + base_key: &str, + output: &mut Vec, +) { + for (key, value) in object { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) + }; + + if starts_with(selector, &base_key) { + match value { + Value::Object(object) => { + fetch_matching_values_in_object(object, selector, &base_key, output) + } + value => output.push(value), + } + } + } +} + +fn starts_with(selector: &str, key: &str) -> bool { + selector.strip_prefix(key).map_or(false, |tail| { + tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) + }) +} + +// FIXME: move to a DocumentId struct + +fn validate_document_id(document_id: &str) -> Option<&str> { + if !document_id.is_empty() + && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) + { + Some(document_id) + } else { + None + } +} + +pub fn validate_document_id_value(document_id: Value) -> Result> { + match document_id { + Value::String(string) => match validate_document_id(&string) { + Some(s) if s.len() == string.len() => Ok(Ok(string)), + Some(s) => Ok(Ok(s.to_string())), + None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), + }, + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), + content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), + } +} diff --git a/milli/src/fields_ids_map.rs 
b/milli/src/fields_ids_map.rs index 810ff755b..85320c168 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -81,6 +81,12 @@ impl Default for FieldsIdsMap { } } +impl crate::documents::FieldDistribution for FieldsIdsMap { + fn id(&self, name: &str) -> Option { + self.id(name) + } +} + #[cfg(test)] mod tests { use super::*; From 9cef800b2aa8bceb31bd82ca3bcd11a59157a8dc Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 14:22:05 +0100 Subject: [PATCH 114/127] Enrich uses the new type --- milli/src/update/index_documents/enrich.rs | 207 ++++----------------- milli/src/update/index_documents/mod.rs | 5 +- 2 files changed, 34 insertions(+), 178 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 22b16f253..03eb3f4de 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,20 +1,17 @@ +use std::fmt; use std::io::{BufWriter, Read, Seek}; use std::result::Result as StdResult; -use std::{fmt, iter}; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; +use crate::documents::{ + DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, + EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, +}; use crate::error::{GeoError, InternalError, UserError}; use crate::update::index_documents::{obkv_to_object, writer_into_reader}; -use crate::{FieldId, Index, Object, Result}; - -/// The symbol used to define levels in a nested primary key. -const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; - -/// The default primary that is used when not specified. 
-const DEFAULT_PRIMARY_KEY: &str = "id"; +use crate::{FieldId, Index, Result}; /// This function validates and enrich the documents by checking that: /// - we can infer a primary key, @@ -41,14 +38,12 @@ pub fn enrich_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let primary_key = match index.primary_key(rtxn)? { - Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { - PrimaryKey::nested(primary_key) - } - Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => PrimaryKey::flat(primary_key, id), - None if autogenerate_docids => { - PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) - } + Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { + Some(primary_key) => primary_key, + None if autogenerate_docids => PrimaryKey::Flat { + name: primary_key, + field_id: documents_batch_index.insert(primary_key), + }, None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -76,14 +71,14 @@ pub fn enrich_documents_batch( }); match guesses.as_slice() { - [] if autogenerate_docids => PrimaryKey::flat( - DEFAULT_PRIMARY_KEY, - documents_batch_index.insert(DEFAULT_PRIMARY_KEY), - ), + [] if autogenerate_docids => PrimaryKey::Flat { + name: DEFAULT_PRIMARY_KEY, + field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + }, [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), [(field_id, name)] => { log::info!("Primary key was not specified in index. 
Inferred to '{name}'"); - PrimaryKey::flat(name, *field_id) + PrimaryKey::Flat { name, field_id: *field_id } } multiple => { return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { - match primary_key { - PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { - match document.get(primary_key_id) { - Some(document_id_bytes) => { - let document_id = serde_json::from_slice(document_id_bytes) - .map_err(InternalError::SerdeJson)?; - match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - } - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - Ok(Ok(DocumentId::generated(uuid.to_string(), count))) - } - None => Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Ok(match primary_key.document_id(document, documents_batch_index)? { + Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), + Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), + Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); + Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) } - nested @ PrimaryKey::Nested { .. 
} => { - let mut matching_documents_ids = Vec::new(); - for (first_level_name, right) in nested.possible_level_names() { - if let Some(field_id) = documents_batch_index.id(first_level_name) { - if let Some(value_bytes) = document.get(field_id) { - let object = serde_json::from_slice(value_bytes) - .map_err(InternalError::SerdeJson)?; - fetch_matching_values(object, right, &mut matching_documents_ids); - - if matching_documents_ids.len() >= 2 { - return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })); - } - } - } - } - - match matching_documents_ids.pop() { - Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), - Err(user_error) => Ok(Err(user_error)), - }, - None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.name().to_string(), - document: obkv_to_object(document, documents_batch_index)?, - })), - } + Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }), + Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { + Err(UserError::TooManyDocumentIds { + primary_key: primary_key.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + }) } - } -} - -/// A type that represent the type of primary key that has been set -/// for this index, a classic flat one or a nested one. -#[derive(Debug, Clone, Copy)] -enum PrimaryKey<'a> { - Flat { name: &'a str, field_id: FieldId }, - Nested { name: &'a str }, -} - -impl PrimaryKey<'_> { - fn flat(name: &str, field_id: FieldId) -> PrimaryKey { - PrimaryKey::Flat { name, field_id } - } - - fn nested(name: &str) -> PrimaryKey { - PrimaryKey::Nested { name } - } - - fn name(&self) -> &str { - match self { - PrimaryKey::Flat { name, .. 
} => name, - PrimaryKey::Nested { name } => name, - } - } - - /// Returns an `Iterator` that gives all the possible fields names the primary key - /// can have depending of the first level name and deepnes of the objects. - fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.name(); - name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) - .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) - .chain(iter::once((name, ""))) - } + }) } /// A type that represents a document id that has been retrieved from a document or auto-generated. @@ -255,14 +182,6 @@ pub enum DocumentId { } impl DocumentId { - fn retrieved(value: String) -> DocumentId { - DocumentId::Retrieved { value } - } - - fn generated(value: String, document_nth: u32) -> DocumentId { - DocumentId::Generated { value, document_nth } - } - fn debug(&self) -> String { format!("{:?}", self) } @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { } } -fn starts_with(selector: &str, key: &str) -> bool { - selector.strip_prefix(key).map_or(false, |tail| { - tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) - }) -} - -pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { - match value { - Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), - otherwise => output.push(otherwise), - } -} - -pub fn fetch_matching_values_in_object( - object: Object, - selector: &str, - base_key: &str, - output: &mut Vec, -) { - for (key, value) in object { - let base_key = if base_key.is_empty() { - key.to_string() - } else { - format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) - }; - - if starts_with(selector, &base_key) { - match value { - Value::Object(object) => { - fetch_matching_values_in_object(object, selector, &base_key, output) - } - value => output.push(value), - } - } - } -} - -pub fn validate_document_id(document_id: &str) -> Option<&str> { - if !document_id.is_empty() - && 
document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) - { - Some(document_id) - } else { - None - } -} - -/// Parses a Json encoded document id and validate it, returning a user error when it is one. -pub fn validate_document_id_value(document_id: Value) -> Result> { - match document_id { - Value::String(string) => match validate_document_id(&string) { - Some(s) if s.len() == string.len() => Ok(Ok(string)), - Some(s) => Ok(Ok(s.to_string())), - None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), - }, - Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), - } -} - /// Try to extract an `f64` from a JSON `Value` and return the `Value` /// in the `Err` variant if it failed. pub fn extract_finite_float_from_value(value: Value) -> StdResult { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2be410ace..d60006289 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,10 +20,7 @@ use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; -pub use self::enrich::{ - extract_finite_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, DocumentId, -}; +pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, From b11c2afac09bd1eae5a1f73e97efe1651add7e67 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 14:22:43 +0100 Subject: [PATCH 115/127] Index::external_id_of --- milli/src/index.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff 
--git a/milli/src/index.rs b/milli/src/index.rs index 86ef6105b..5b705e0b2 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,6 +12,7 @@ use rstar::RTree; use time::OffsetDateTime; use crate::distance::NDotProductPoint; +use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ @@ -1176,6 +1177,36 @@ impl Index { self.iter_documents(rtxn, self.documents_ids(rtxn)?) } + pub fn external_id_of<'a, 't: 'a>( + &'a self, + rtxn: &'t RoTxn, + ids: impl IntoIterator + 'a, + ) -> Result> + 'a> { + let fields = self.fields_ids_map(rtxn)?; + + // uses precondition "never called on an empty index" + let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })?; + let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| { + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName { + field_name: primary_key.to_owned(), + process: "external_id_of", + }) + })?; + Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> { + let (_docid, obkv) = entry?; + match primary_key.document_id(&obkv, &fields)? 
{ + Ok(document_id) => Ok(document_id), + Err(_) => Err(InternalError::DocumentsError( + crate::documents::Error::InvalidDocumentFormat, + ) + .into()), + } + })) + } + pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { FacetDistribution::new(rtxn, self) } From 3053e01c05df5d840c1d4efe4810cdafef5a8c70 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 14:23:02 +0100 Subject: [PATCH 116/127] Batch::remove_documents_from_db_no_batch --- milli/src/update/index_documents/mod.rs | 33 ++++++++ milli/src/update/index_documents/transform.rs | 83 +++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d60006289..de40e0b9b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -194,6 +194,39 @@ where Ok((self, Ok(deleted_documents))) } + /// Removes documents from db using their internal document ids. + /// + /// # Warning + /// + /// This function is dangerous and will only work correctly if: + /// + /// - All the passed ids currently exist in the database + /// - No batching using the standards `remove_documents` and `add_documents` took place + /// + /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. + pub fn remove_documents_from_db_no_batch( + mut self, + to_delete: &RoaringBitmap, + ) -> Result<(Self, u64)> { + puffin::profile_function!(); + + // Early return when there is no document to add + if to_delete.is_empty() { + return Ok((self, 0)); + } + + let deleted_documents = self + .transform + .as_mut() + .expect("Invalid document deletion state") + .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)? 
+ as u64; + + self.deleted_documents += deleted_documents; + + Ok((self, deleted_documents)) + } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute(mut self) -> Result { puffin::profile_function!(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 186974bfe..5f5e698d3 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -481,6 +481,89 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(documents_deleted) } + /// The counter part of `read_documents` that removes documents either from the transform or the database. + /// It can be called before, after or in between two calls of the `read_documents`. + /// + /// It needs to update all the internal datastructure in the transform. + /// - If the document is coming from the database -> it's marked as a to_delete document + /// - If the document to remove was inserted by the `read_documents` method before AND was present in the db, + /// it's marked as `to_delete` + added into the grenad to ensure we don't reinsert it. + /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, + /// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. + /// - If the document to remove was not present in either the db or the transform we do nothing. 
+ #[logging_timer::time] + pub fn remove_documents_from_db_no_batch( + &mut self, + to_remove: &RoaringBitmap, + wtxn: &mut heed::RwTxn, + should_abort: FA, + ) -> Result + where + FA: Fn() -> bool + Sync, + { + puffin::profile_function!(); + + let mut documents_deleted = 0; + let mut document_sorter_value_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); + let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?; + + for (to_remove, external_docid) in to_remove.iter().zip(external_ids) { + let external_docid = external_docid?; + if should_abort() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + self.replaced_documents_ids.insert(to_remove); + + // fetch the obkv document + let original_key = BEU32::new(to_remove); + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(wtxn, &original_key)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&to_remove.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); + // push it as to delete in the original_sorter + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(base_obkv), + true, + false, + &mut document_sorter_value_buffer, + )?; + self.original_sorter + .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + + // flatten it and push it as to delete in the flattened_sorter + let flattened_obkv = KvReader::new(base_obkv); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? 
{ + // we recreate our buffer with the flattened documents + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv( + KvReaderU16::new(&obkv), + true, + false, + &mut document_sorter_value_buffer, + )?; + } + self.flattened_sorter.insert(to_remove.to_be_bytes(), &document_sorter_value_buffer)?; + + documents_deleted += 1; + } + + Ok(documents_deleted) + } + // Flatten a document from the fields ids map contained in self and insert the new // created fields. Returns `None` if the document doesn't need to be flattened. fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { From f8289cd974d957d38645ca66c993ca518ec81955 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 14:23:15 +0100 Subject: [PATCH 117/127] Use it from delete-by-filter --- index-scheduler/src/batch.rs | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index c9deedb37..5260a9d7e 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -1534,18 +1534,6 @@ fn delete_document_by_filter<'a>( } e => e.into(), })?; - let external_documents_ids = index.external_documents_ids(); - // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings). - // Since what we have is an iterator, it would be better to delete in chunks - let external_to_internal: std::result::Result, RoaringBitmap> = - external_documents_ids - .find_external_id_of(wtxn, candidates)? 
- .only_external_ids() - .collect(); - let document_ids = match external_to_internal { - Ok(external_ids) => external_ids, - Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids), - }; let config = IndexDocumentsConfig { update_method: IndexDocumentsMethod::ReplaceDocuments, @@ -1561,13 +1549,10 @@ fn delete_document_by_filter<'a>( || must_stop_processing.get(), )?; - let (new_builder, user_result) = builder.remove_documents(document_ids)?; + let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?; builder = new_builder; - // Uses Invariant: remove documents actually always returns Ok for the inner result - let count = user_result.unwrap(); let _ = builder.execute()?; - count } else { 0 From 825257da76b33809cbb0496773449c63de023260 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 16:13:15 +0100 Subject: [PATCH 118/127] Use more efficient method for deletion in benchmarks --- benchmarks/benches/indexing.rs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index c31bfab89..65f581b93 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -864,22 +864,12 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec, RoaringBitmap> = - external_documents_ids - .find_external_id_of(&wtxn, ids) - .unwrap() - .only_external_ids() - .collect(); - let ids = external_to_internal.unwrap(); let config = IndexDocumentsConfig::default(); let mut builder = IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false) .unwrap(); - (builder, _) = builder.remove_documents(ids).unwrap(); + (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap(); builder.execute().unwrap(); } From 264b10ec20956cfe599cfeb5e3fc08ae2298bfc8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 9 Nov 2023 16:23:20 +0100 Subject: [PATCH 119/127] Fixup documentation 
--- milli/src/update/index_documents/transform.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 5f5e698d3..23313547a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -481,16 +481,16 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(documents_deleted) } - /// The counter part of `read_documents` that removes documents either from the transform or the database. - /// It can be called before, after or in between two calls of the `read_documents`. + /// Removes documents from db using their internal document ids. /// - /// It needs to update all the internal datastructure in the transform. - /// - If the document is coming from the database -> it's marked as a to_delete document - /// - If the document to remove was inserted by the `read_documents` method before AND was present in the db, - /// it's marked as `to_delete` + added into the grenad to ensure we don't reinsert it. - /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, - /// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. - /// - If the document to remove was not present in either the db or the transform we do nothing. + /// # Warning + /// + /// This function is dangerous and will only work correctly if: + /// + /// - All the passed ids currently exist in the database + /// - No batching using the standards `remove_documents` and `add_documents` took place + /// + /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. 
#[logging_timer::time] pub fn remove_documents_from_db_no_batch( &mut self, From 1f364105419170f8c5a65a57e23f43a45c58725d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 13 Nov 2023 13:36:39 +0100 Subject: [PATCH 120/127] Update tests --- milli/src/search/new/tests/proximity.rs | 18 +++++++++--------- ...sts__proximity__proximity_prefix_db-14.snap | 18 +++++++++--------- ...ests__proximity__proximity_prefix_db-2.snap | 18 +++++++++--------- ...ests__proximity__proximity_prefix_db-8.snap | 18 +++++++++--------- milli/src/snapshot_tests.rs | 16 ---------------- 5 files changed, 36 insertions(+), 52 deletions(-) diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 217ebe9b3..2d181a537 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -371,7 +371,7 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best s"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -379,13 +379,13 @@ fn test_proximity_prefix_db() { insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best summer meal\"", - "\"summer best\"", "\"this is the best meal of summer\"", - "\"summer x best\"", "\"this is the best meal I have ever had in such a beautiful summer day\"", "\"this is the best cooked meal of the summer\"", "\"this is the best meal of the summer\"", "\"summer x y best\"", + "\"summer x best\"", + "\"summer best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", ] "###); @@ -423,20 +423,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best win"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best winter meal\"", - "\"winter best\"", "\"this is the best meal of winter\"", - "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", ] "###); @@ -471,20 +471,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best wi"); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best winter meal\"", - "\"winter best\"", "\"this is the best meal of winter\"", - "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", ] "###); } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap index 8f3b964c1..efcfef7f1 100644 --- 
a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap index 1ee6bfc91..242bc3424 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap index 8f3b964c1..efcfef7f1 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - 
}, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f3f1eb5a5..28c4cb45c 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -219,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String { &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }) } -pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (proximity, word1, prefix), - b, - )| { - &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) - }) -} -pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( - (proximity, prefix, word2), - b, - )| { - &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) - }) -} pub fn snap_word_position_docids(index: &Index) -> String { make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) From 378deb0bef48269ee373fc3a6426e24c80393b2e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 13 Nov 2023 13:37:58 +0100 Subject: [PATCH 121/127] Rename trait --- milli/src/documents/mod.rs | 6 ++---- milli/src/documents/primary_key.rs | 10 +++++++--- milli/src/fields_ids_map.rs | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 4429f083d..a874ac17e 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -12,9 +12,7 @@ use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; 
use obkv::KvReader; -pub use primary_key::{ - DocumentIdExtractionError, FieldDistribution, PrimaryKey, DEFAULT_PRIMARY_KEY, -}; +pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY}; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -91,7 +89,7 @@ impl DocumentsBatchIndex { } } -impl FieldDistribution for DocumentsBatchIndex { +impl FieldIdMapper for DocumentsBatchIndex { fn id(&self, name: &str) -> Option { self.id(name) } diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs index dd97f2608..16a95c21f 100644 --- a/milli/src/documents/primary_key.rs +++ b/milli/src/documents/primary_key.rs @@ -11,7 +11,11 @@ const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; /// The default primary that is used when not specified. pub const DEFAULT_PRIMARY_KEY: &str = "id"; -pub trait FieldDistribution { +/// Trait for objects that can map the name of a field to its [`FieldId`]. +pub trait FieldIdMapper { + /// Attempts to map the passed name to its [`FieldId`]. + /// + /// `None` if the field with this name was not found. 
fn id(&self, name: &str) -> Option; } @@ -30,7 +34,7 @@ pub enum DocumentIdExtractionError { } impl<'a> PrimaryKey<'a> { - pub fn new(path: &'a str, fields: &impl FieldDistribution) -> Option { + pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option { Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) { Self::Nested { name: path } } else { @@ -49,7 +53,7 @@ impl<'a> PrimaryKey<'a> { pub fn document_id( &self, document: &obkv::KvReader, - fields: &impl FieldDistribution, + fields: &impl FieldIdMapper, ) -> Result> { match self { PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) { diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 85320c168..9c1c87f82 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -81,7 +81,7 @@ impl Default for FieldsIdsMap { } } -impl crate::documents::FieldDistribution for FieldsIdsMap { +impl crate::documents::FieldIdMapper for FieldsIdsMap { fn id(&self, name: &str) -> Option { self.id(name) } From 772964125d6f59ef4eaa1957c305211f45f07526 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 13 Nov 2023 13:51:22 +0100 Subject: [PATCH 122/127] Factor removal of document from DB --- milli/src/update/index_documents/transform.rs | 143 +++++++----------- 1 file changed, 56 insertions(+), 87 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 23313547a..8dc88efb9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -421,52 +421,13 @@ impl<'a, 'i> Transform<'a, 'i> { // Then we push the document in sorters in deletion mode. let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? { Some(docid) => { - self.replaced_documents_ids.insert(docid); - - // fetch the obkv document - let original_key = BEU32::new(docid); - let base_obkv = self - .index - .documents - .remap_data_type::() - .get(wtxn, &original_key)? 
- .ok_or(InternalError::DatabaseMissingEntry { - db_name: db_name::DOCUMENTS, - key: None, - })?; - - // Key is the concatenation of the internal docid and the external one. - document_sorter_key_buffer.clear(); - document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); - document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); - // push it as to delete in the original_sorter - document_sorter_value_buffer.clear(); - document_sorter_value_buffer.push(Operation::Deletion as u8); - into_del_add_obkv( - KvReaderU16::new(base_obkv), - true, - false, + self.remove_document_from_db( + docid, + to_remove, + wtxn, + &mut document_sorter_key_buffer, &mut document_sorter_value_buffer, )?; - self.original_sorter - .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - - // flatten it and push it as to delete in the flattened_sorter - let flattened_obkv = KvReader::new(base_obkv); - if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { - // we recreate our buffer with the flattened documents - document_sorter_value_buffer.clear(); - document_sorter_value_buffer.push(Operation::Deletion as u8); - into_del_add_obkv( - KvReaderU16::new(&obkv), - true, - false, - &mut document_sorter_value_buffer, - )?; - } - self.flattened_sorter - .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; - true } None => false, @@ -508,55 +469,18 @@ impl<'a, 'i> Transform<'a, 'i> { let mut document_sorter_key_buffer = Vec::new(); let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?; - for (to_remove, external_docid) in to_remove.iter().zip(external_ids) { + for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) { let external_docid = external_docid?; if should_abort() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } - self.replaced_documents_ids.insert(to_remove); - - // fetch the obkv document - let original_key = BEU32::new(to_remove); - let base_obkv = self - .index - 
.documents - .remap_data_type::() - .get(wtxn, &original_key)? - .ok_or(InternalError::DatabaseMissingEntry { - db_name: db_name::DOCUMENTS, - key: None, - })?; - - // Key is the concatenation of the internal docid and the external one. - document_sorter_key_buffer.clear(); - document_sorter_key_buffer.extend_from_slice(&to_remove.to_be_bytes()); - document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); - // push it as to delete in the original_sorter - document_sorter_value_buffer.clear(); - document_sorter_value_buffer.push(Operation::Deletion as u8); - into_del_add_obkv( - KvReaderU16::new(base_obkv), - true, - false, + self.remove_document_from_db( + internal_docid, + external_docid, + wtxn, + &mut document_sorter_key_buffer, &mut document_sorter_value_buffer, )?; - self.original_sorter - .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; - - // flatten it and push it as to delete in the flattened_sorter - let flattened_obkv = KvReader::new(base_obkv); - if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { - // we recreate our buffer with the flattened documents - document_sorter_value_buffer.clear(); - document_sorter_value_buffer.push(Operation::Deletion as u8); - into_del_add_obkv( - KvReaderU16::new(&obkv), - true, - false, - &mut document_sorter_value_buffer, - )?; - } - self.flattened_sorter.insert(to_remove.to_be_bytes(), &document_sorter_value_buffer)?; documents_deleted += 1; } @@ -564,6 +488,51 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(documents_deleted) } + fn remove_document_from_db( + &mut self, + internal_docid: u32, + external_docid: String, + txn: &heed::RoTxn, + document_sorter_key_buffer: &mut Vec, + document_sorter_value_buffer: &mut Vec, + ) -> Result<()> { + self.replaced_documents_ids.insert(internal_docid); + + // fetch the obkv document + let original_key = BEU32::new(internal_docid); + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(txn, &original_key)? 
+ .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + // Key is the concatenation of the internal docid and the external one. + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); + // push it as to delete in the original_sorter + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv(KvReaderU16::new(base_obkv), true, false, document_sorter_value_buffer)?; + self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; + + // flatten it and push it as to delete in the flattened_sorter + let flattened_obkv = KvReader::new(base_obkv); + if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { + // we recreate our buffer with the flattened documents + document_sorter_value_buffer.clear(); + document_sorter_value_buffer.push(Operation::Deletion as u8); + into_del_add_obkv(KvReaderU16::new(&obkv), true, false, document_sorter_value_buffer)?; + } + self.flattened_sorter + .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?; + Ok(()) + } + // Flatten a document from the fields ids map contained in self and insert the new // created fields. Returns `None` if the document doesn't need to be flattened. 
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { From 263e82561973020c112cd7d74a76c479d76de1c8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Nov 2023 10:06:29 +0100 Subject: [PATCH 123/127] Fix typos in comments --- .../extract/extract_docid_word_positions.rs | 8 ++++---- .../extract/extract_fid_docid_facet_values.rs | 4 ++-- milli/src/update/index_documents/mod.rs | 1 - milli/src/update/index_documents/typed_chunk.rs | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 0dcd6a42a..303b64271 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -204,7 +204,7 @@ fn tokenizer_builder<'a>( tokenizer_builder } -/// Extract words maped with their positions of a document, +/// Extract words mapped with their positions of a document, /// ensuring no Language detection mistakes was made. #[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct fn lang_safe_tokens_from_document<'a>( @@ -273,7 +273,7 @@ fn lang_safe_tokens_from_document<'a>( Ok((&buffers.obkv_buffer, script_language_word_count)) } -/// Extract words maped with their positions of a document. +/// Extract words mapped with their positions of a document. fn tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, @@ -294,11 +294,11 @@ fn tokens_from_document<'a>( let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - // prepare writting destination. + // prepare writing destination. buffers.obkv_positions_buffer.clear(); let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer); - // convert json into an unique string. + // convert json into a unique string. 
buffers.field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { // create an iterator of token with their positions. diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 2dce90cfc..3fcec3e79 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -75,7 +75,7 @@ pub fn extract_fid_docid_facet_values( let mut facet_is_null_docids = BTreeMap::::new(); let mut facet_is_empty_docids = BTreeMap::::new(); - // We create two buffer for mutable ref issues with closures. + // We create two buffers for mutable ref issues with closures. let mut numbers_key_buffer = Vec::new(); let mut strings_key_buffer = Vec::new(); @@ -333,7 +333,7 @@ where key_buffer.extend_from_slice(&value_bytes); key_buffer.extend_from_slice(&number.to_be_bytes()); - // We insert only the Del part of the Obkv to inform + // We insert only the Add part of the Obkv to inform // that we only want to remove all those numbers. 
let mut obkv = KvWriterDelAdd::memory(); obkv.insert(DelAdd::Addition, ().as_bytes())?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2289666ed..113114681 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -449,7 +449,6 @@ where otherwise => otherwise, }; - // FIXME: return newly added as well as newly deleted documents let (docids, is_merged_database) = write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 90f9b7739..dda2ebc1c 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -409,7 +409,7 @@ pub(crate) fn write_typed_chunk_into_index( dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) }; - // Ensure that the vector lenghts are correct and + // Ensure that the vector lengths are correct and // prepare the vectors before inserting them in the HNSW. 
let mut points = Vec::new(); let mut docids = Vec::new(); From d59b7db8d09bb5881adc223c59cc2e3c8a999546 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Nov 2023 10:10:05 +0100 Subject: [PATCH 124/127] remove unused code --- milli/src/external_documents_ids.rs | 71 ------------------- .../helpers/merge_functions.rs | 12 ---- .../src/update/index_documents/helpers/mod.rs | 14 ++-- 3 files changed, 4 insertions(+), 93 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 0e4891649..ec419446c 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -2,7 +2,6 @@ use std::collections::HashMap; use heed::types::{OwnedType, Str}; use heed::{Database, RoIter, RoTxn, RwTxn}; -use roaring::RoaringBitmap; use crate::{DocumentId, BEU32}; @@ -44,23 +43,6 @@ impl ExternalDocumentsIds { Ok(map) } - /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between - /// these internal ids and their external id. - /// - /// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`, - /// where the returned values can be: - /// - `Ok((external_id, internal_id))`: if a mapping was found - /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found. - /// In that case the returned bitmap contains the internal ids whose external ids were not found after traversing - /// the entire fst. - pub fn find_external_id_of<'t>( - &self, - rtxn: &'t RoTxn, - internal_ids: RoaringBitmap, - ) -> heed::Result> { - self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids }) - } - /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. /// /// If the list contains multiple operations on the same external id, then the result is unspecified. 
@@ -91,56 +73,3 @@ impl ExternalDocumentsIds { self.0.iter(rtxn) } } - -/// An iterator over mappings between requested internal ids and external ids. -/// -/// See [`ExternalDocumentsIds::find_external_id_of`] for details. -pub struct ExternalToInternalOwnedIterator<'t> { - iter: RoIter<'t, Str, OwnedType>, - internal_ids: RoaringBitmap, -} - -impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> { - /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids. - type Item = Result<(&'t str, DocumentId), RoaringBitmap>; - - fn next(&mut self) -> Option { - // if all requested ids were found, we won't find any other, so short-circuit - if self.internal_ids.is_empty() { - return None; - } - loop { - let (external, internal) = match self.iter.next() { - Some(Ok((external, internal))) => (external, internal), - // TODO manage this better, remove panic - Some(Err(e)) => panic!("{}", e), - _ => { - // we exhausted the stream but we still have some internal ids to find - let remaining_ids = std::mem::take(&mut self.internal_ids); - return Some(Err(remaining_ids)); - // note: next calls to `next` will return `None` since we replaced the internal_ids - // with the default empty bitmap - } - }; - let internal = internal.get(); - let was_contained = self.internal_ids.remove(internal); - if was_contained { - return Some(Ok((external, internal))); - } - } - } -} - -impl<'t> ExternalToInternalOwnedIterator<'t> { - /// Returns the bitmap of internal ids whose external id are yet to be found - pub fn remaining_internal_ids(&self) -> &RoaringBitmap { - &self.internal_ids - } - - /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids. - /// - /// Use this when you don't need the mapping between the external and the internal ids. 
- pub fn only_external_ids(self) -> impl Iterator> + 't { - self.map(|res| res.map(|(external, _internal)| external.to_owned())) - } -} diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 5d9ca7ef2..d355ead68 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -12,18 +12,6 @@ use crate::Result; pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; -#[allow(unused)] -pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let capacity = values.iter().map(|v| v.len()).sum::(); - let mut output = Vec::with_capacity(capacity); - values.iter().for_each(|integers| output.extend_from_slice(integers)); - Ok(Cow::Owned(output)) - } -} - pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { buffer.clear(); buffer.reserve(bitmap.serialized_size()); diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 841c09543..52638d6f6 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -13,11 +13,10 @@ pub use grenad_helpers::{ GrenadParameters, MergeableReader, }; pub use merge_functions::{ - concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, - merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, - obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, - serialize_roaring_bitmap, MergeFn, + keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, 
obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, }; use crate::MAX_WORD_LENGTH; @@ -46,11 +45,6 @@ where Some((head, tail)) } -#[allow(unused)] -pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { - bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) -} - /// Converts an fst Stream into an HashSet of Strings. pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet> where From ebef6bc24db04dac8b463c820b372d1895ea7584 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Nov 2023 10:14:57 +0100 Subject: [PATCH 125/127] Simplify documents database writing --- milli/src/update/index_documents/typed_chunk.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index dda2ebc1c..4f9f0ef6f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -140,20 +140,9 @@ pub(crate) fn write_typed_chunk_into_index( for (field_id, value) in reader.iter() { let del_add_reader = KvReaderDelAdd::new(value); - match ( - del_add_reader.get(DelAdd::Deletion), - del_add_reader.get(DelAdd::Addition), - ) { - (None, None) => {} - (None, Some(value)) => { - // anyway, write - writer.insert(field_id, value)?; - } - (Some(_), None) => {} - (Some(_), Some(value)) => { - // updated field, write - writer.insert(field_id, value)?; - } + + if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + writer.insert(field_id, addition)?; } } From 39cbb499c2db09fe2385a5bdb2f294298d5b6366 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Nov 2023 10:20:39 +0100 Subject: [PATCH 126/127] Small fixes --- milli/src/update/del_add.rs | 4 +++- .../index_documents/extract/extract_word_position_docids.rs | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/milli/src/update/del_add.rs 
b/milli/src/update/del_add.rs index dc7c0409a..07a20b025 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -53,7 +53,9 @@ pub fn into_del_add_obkv( value_writer.insert(DelAdd::Addition, value)?; } value_writer.finish()?; - writer.insert(key, &value_buffer)?; + if !value_buffer.is_empty() { + writer.insert(key, &value_buffer)?; + } } writer.finish() diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 1b9ec66ff..89b77d140 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -121,6 +121,7 @@ fn words_position_into_sorter( key } Both(key, _) => { + // both values needs to be kept because it will be used in other extractors. value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); key From d3575fb0280cb5a37cd17d1a904026080092ffe4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Nov 2023 10:53:40 +0100 Subject: [PATCH 127/127] Make into_del_add_obkv parameters more human readable --- milli/src/update/del_add.rs | 25 ++++--- milli/src/update/index_documents/transform.rs | 75 +++++++++++++------ 2 files changed, 68 insertions(+), 32 deletions(-) diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 07a20b025..794beb5df 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -32,13 +32,12 @@ impl Key for DelAdd { /// Creates a Kv> from Kv /// -/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key. -/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key. -/// if both deletion and addition are `true, the value will be inserted in both keys. 
+/// Deletion: put all the values under DelAdd::Deletion +/// Addition: put all the values under DelAdd::Addition, +/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition, pub fn into_del_add_obkv( reader: obkv::KvReader, - deletion: bool, - addition: bool, + operation: DelAddOperation, buffer: &mut Vec, ) -> Result<(), std::io::Error> { let mut writer = obkv::KvWriter::new(buffer); @@ -46,21 +45,27 @@ pub fn into_del_add_obkv( for (key, value) in reader.iter() { value_buffer.clear(); let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); - if deletion { + if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { value_writer.insert(DelAdd::Deletion, value)?; } - if addition { + if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) { value_writer.insert(DelAdd::Addition, value)?; } value_writer.finish()?; - if !value_buffer.is_empty() { - writer.insert(key, &value_buffer)?; - } + writer.insert(key, &value_buffer)?; } writer.finish() } +/// Enum controlling the side of the DelAdd obkv in which the provided value will be written. 
+#[derive(Debug, Clone, Copy)] +pub enum DelAddOperation { + Deletion, + Addition, + DeletionAndAddition, +} + /// Creates a Kv> from two Kv /// /// putting each deletion obkv's keys under an DelAdd::Deletion diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8dc88efb9..323bc3da7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -21,7 +21,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd}; +use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; use crate::update::index_documents::GrenadParameters; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ @@ -265,8 +265,12 @@ impl<'a, 'i> Transform<'a, 'i> { skip_insertion = true; } else { // we associate the base document with the new key, everything will get merged later. 
- let keep_original_version = - self.index_documents_method == IndexDocumentsMethod::UpdateDocuments; + let deladd_operation = match self.index_documents_method { + IndexDocumentsMethod::UpdateDocuments => { + DelAddOperation::DeletionAndAddition + } + IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion, + }; document_sorter_key_buffer.clear(); document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); @@ -274,8 +278,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(base_obkv), - true, - keep_original_version, + deladd_operation, &mut document_sorter_value_buffer, )?; self.original_sorter @@ -287,8 +290,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(&flattened_obkv), - true, - keep_original_version, + deladd_operation, &mut document_sorter_value_buffer, )?; } @@ -307,8 +309,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(&obkv_buffer), - false, - true, + DelAddOperation::Addition, &mut document_sorter_value_buffer, )?; // We use the extracted/generated user id as the key for this document. @@ -321,8 +322,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.push(Operation::Addition as u8); into_del_add_obkv( KvReaderU16::new(&obkv), - false, - true, + DelAddOperation::Addition, &mut document_sorter_value_buffer, )? 
} @@ -517,7 +517,11 @@ impl<'a, 'i> Transform<'a, 'i> { // push it as to delete in the original_sorter document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Deletion as u8); - into_del_add_obkv(KvReaderU16::new(base_obkv), true, false, document_sorter_value_buffer)?; + into_del_add_obkv( + KvReaderU16::new(base_obkv), + DelAddOperation::Deletion, + document_sorter_value_buffer, + )?; self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; // flatten it and push it as to delete in the flattened_sorter @@ -526,7 +530,11 @@ impl<'a, 'i> Transform<'a, 'i> { // we recreate our buffer with the flattened documents document_sorter_value_buffer.clear(); document_sorter_value_buffer.push(Operation::Deletion as u8); - into_del_add_obkv(KvReaderU16::new(&obkv), true, false, document_sorter_value_buffer)?; + into_del_add_obkv( + KvReaderU16::new(&obkv), + DelAddOperation::Deletion, + document_sorter_value_buffer, + )?; } self.flattened_sorter .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?; @@ -869,8 +877,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); into_del_add_obkv( KvReaderU16::new(buffer), - false, - true, + DelAddOperation::Addition, &mut document_sorter_value_buffer, )?; original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; @@ -911,8 +918,7 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_value_buffer.clear(); into_del_add_obkv( KvReaderU16::new(&buffer), - false, - true, + DelAddOperation::Addition, &mut document_sorter_value_buffer, )?; flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; @@ -986,18 +992,38 @@ mod test { let mut kv_writer = KvWriter::memory(); kv_writer.insert(0_u8, [0]).unwrap(); let buffer = kv_writer.into_inner().unwrap(); - into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + 
DelAddOperation::Addition, + &mut additive_doc_0, + ) + .unwrap(); additive_doc_0.insert(0, Operation::Addition as u8); - into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Deletion, + &mut deletive_doc_0, + ) + .unwrap(); deletive_doc_0.insert(0, Operation::Deletion as u8); - into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::DeletionAndAddition, + &mut del_add_doc_0, + ) + .unwrap(); del_add_doc_0.insert(0, Operation::Addition as u8); let mut additive_doc_1 = Vec::new(); let mut kv_writer = KvWriter::memory(); kv_writer.insert(1_u8, [1]).unwrap(); let buffer = kv_writer.into_inner().unwrap(); - into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_1, + ) + .unwrap(); additive_doc_1.insert(0, Operation::Addition as u8); let mut additive_doc_0_1 = Vec::new(); @@ -1005,7 +1031,12 @@ mod test { kv_writer.insert(0_u8, [0]).unwrap(); kv_writer.insert(1_u8, [1]).unwrap(); let buffer = kv_writer.into_inner().unwrap(); - into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap(); + into_del_add_obkv( + KvReaderU16::new(&buffer), + DelAddOperation::Addition, + &mut additive_doc_0_1, + ) + .unwrap(); additive_doc_0_1.insert(0, Operation::Addition as u8); let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])