Use fst 0.4.4 in the project

Commit bc7b0a38fd (parent 6c87723b19), from a mirror of
https://github.com/meilisearch/meilisearch.git.
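
The whole diff follows from one upstream change: fst 0.4 makes `Set` and `Map` generic over the bytes that back them. A sketch of the 0.4 shapes this commit leans on (an illustration, not part of the diff):

```rust
// fst 0.4: Set<D> is generic over D: AsRef<[u8]>, so a set can own its bytes
// (Vec<u8>) or borrow them in place. The 0.3-era constructors this crate used
// (Set::from_bytes, from_static_slice, the Arc-based from_shared_bytes) are gone.
fn fst_04_shapes() -> Result<(), fst::Error> {
    // Owned: build from pre-sorted keys.
    let owned: fst::Set<Vec<u8>> = fst::Set::from_iter(vec!["bar", "foo"])?;

    // Borrowed: validate a byte slice in place; zero copies, no unsafe.
    let bytes: &[u8] = owned.as_fst().as_bytes();
    let borrowed: fst::Set<&[u8]> = fst::Set::new(bytes)?;

    assert!(borrowed.contains("foo"));
    Ok(())
}
```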
@@ -79,12 +79,8 @@ where
 
     let mut result = SortResult::default();
 
-    let words_set = match unsafe { main_store.static_words_fst(reader)? } {
-        Some(words) => words,
-        None => return Ok(SortResult::default()),
-    };
-
-    let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();
+    let words_set = main_store.words_fst(reader)?;
+    let stop_words = main_store.stop_words_fst(reader)?;
 
     let context = QTContext {
         words_set,

@@ -230,12 +226,8 @@ where
 {
     let mut result = SortResult::default();
 
-    let words_set = match unsafe { main_store.static_words_fst(reader)? } {
-        Some(words) => words,
-        None => return Ok(SortResult::default()),
-    };
-
-    let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();
+    let words_set = main_store.words_fst(reader)?;
+    let stop_words = main_store.stop_words_fst(reader)?;
 
     let context = QTContext {
         words_set,
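
Both bucket-sort call sites shed the `unsafe` workaround in the same way: `words_fst` now returns a set borrowing directly from the read transaction, and a missing key is just an empty set, so the early-return `match` disappears. A self-contained sketch of the borrowing accessor, with a byte buffer standing in for the LMDB transaction:

```rust
use std::borrow::Cow;

type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;

// Stand-in for the store accessor: `txn_bytes` plays the role of the page
// bytes owned by the read transaction.
fn words_fst(txn_bytes: &[u8]) -> Result<FstSetCow<'_>, fst::Error> {
    fst::Set::new(txn_bytes)?.map_data(Cow::Borrowed)
}
```

The returned set cannot outlive `txn_bytes`; the borrow checker now enforces what the removed `unsafe { main_store.static_words_fst(reader)? }` merely promised.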
@@ -38,16 +38,20 @@ pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
 pub use meilisearch_schema::Schema;
 pub use query_words_mapper::QueryWordsMapper;
 
-use std::convert::TryFrom;
-use std::collections::HashMap;
 use compact_arena::SmallArena;
 use log::{error, trace};
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::convert::TryFrom;
 
 use crate::bucket_sort::PostingsListView;
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::query_tree::{QueryId, QueryKind};
 use crate::reordered_attrs::ReorderedAttrs;
 
+type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;
+type FstMapCow<'a> = fst::Map<Cow<'a, [u8]>>;
+
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Document {
     pub id: DocumentId,
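
The two aliases added here are the backbone of the rest of the diff: every store read below returns one of them. One alias covers both the owned and the borrowed case:

```rust
use std::borrow::Cow;

type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;

fn demo() -> Result<(), fst::Error> {
    // Owned: a freshly built set wrapped in Cow::Owned, alive for 'static.
    let owned: FstSetCow<'static> =
        fst::Set::from_iter(vec!["bar", "foo"])?.map_data(Cow::Owned)?;

    // Borrowed: the same alias borrowing bytes that live elsewhere.
    let backing = owned.as_fst().as_bytes().to_vec();
    let borrowed: FstSetCow<'_> =
        fst::Set::new(backing.as_slice())?.map_data(Cow::Borrowed)?;

    assert!(owned.contains("foo") && borrowed.contains("bar"));
    Ok(())
}
```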
@@ -186,7 +186,7 @@ mod tests {
     use std::collections::{BTreeSet, HashMap};
     use std::iter::FromIterator;
 
-    use fst::{IntoStreamer, Set};
+    use fst::IntoStreamer;
     use meilisearch_schema::IndexedPos;
     use sdset::SetBuf;
     use tempfile::TempDir;

@@ -199,21 +199,21 @@ mod tests {
     use crate::store::Index;
     use meilisearch_schema::Schema;
 
-    fn set_from_stream<'f, I, S>(stream: I) -> Set
+    fn set_from_stream<'f, I, S>(stream: I) -> fst::Set<Vec<u8>>
     where
         I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
        S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
     {
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(stream).unwrap();
-        builder.into_inner().and_then(Set::from_bytes).unwrap()
+        builder.into_set()
     }
 
-    fn insert_key(set: &Set, key: &[u8]) -> Set {
+    fn insert_key<A: AsRef<[u8]>>(set: &fst::Set<A>, key: &[u8]) -> fst::Set<Vec<u8>> {
         let unique_key = {
             let mut builder = fst::SetBuilder::memory();
             builder.insert(key).unwrap();
-            builder.into_inner().and_then(Set::from_bytes).unwrap()
+            builder.into_set()
         };
 
         let union_ = set.op().add(unique_key.into_stream()).r#union();

@@ -221,11 +221,11 @@ mod tests {
         set_from_stream(union_)
     }
 
-    fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
+    fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set<Vec<u8>> {
         let mut builder = fst::SetBuilder::memory();
         let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
         builder.extend_iter(set.into_iter()).unwrap();
-        builder.into_inner().and_then(Set::from_bytes).unwrap()
+        builder.into_set()
     }
 
     const fn doc_index(document_id: u32, word_index: u16) -> DocIndex {
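
Assembled, the reworked test helper reads as below; `into_set()` replaces the 0.3 `into_inner().and_then(Set::from_bytes)` dance, and the helper now accepts a set with any byte backing while returning an owned one:

```rust
use fst::IntoStreamer;

fn insert_key<A: AsRef<[u8]>>(set: &fst::Set<A>, key: &[u8]) -> fst::Set<Vec<u8>> {
    // A one-element set holding the new key.
    let unique_key = {
        let mut builder = fst::SetBuilder::memory();
        builder.insert(key).unwrap();
        builder.into_set()
    };

    // Stream the union of the old set and the new key into a fresh builder.
    let union_ = set.op().add(unique_key.into_stream()).r#union();

    let mut builder = fst::SetBuilder::memory();
    builder.extend_stream(union_).unwrap();
    builder.into_set()
}
```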
@@ -265,15 +265,11 @@ mod tests {
 
             let word = normalize_str(word);
 
-            let alternatives = match self
+            let alternatives = self
                 .index
                 .synonyms
                 .synonyms(&writer, word.as_bytes())
-                .unwrap()
-            {
-                Some(alternatives) => alternatives,
-                None => fst::Set::default(),
-            };
+                .unwrap();
 
             let new = sdset_into_fstset(&new);
             let new_alternatives =

@@ -283,10 +279,7 @@ mod tests {
                 .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
                 .unwrap();
 
-            let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() {
-                Some(synonyms) => synonyms,
-                None => fst::Set::default(),
-            };
+            let synonyms = self.index.main.synonyms_fst(&writer).unwrap();
 
             let synonyms_fst = insert_key(&synonyms, word.as_bytes());
             self.index

@@ -339,7 +332,7 @@ mod tests {
 
         index.main.put_schema(&mut writer, &schema).unwrap();
 
-        let words_fst = Set::from_iter(words_fst).unwrap();
+        let words_fst = fst::Set::from_iter(words_fst).unwrap();
 
         index.main.put_words_fst(&mut writer, &words_fst).unwrap();
 
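
The test's synonym lookups lose their `match` arms because an absent entry now comes back as an empty set, and an empty fst set behaves exactly like "no entry":

```rust
fn empty_set_behaves_like_absence() {
    let empty = fst::Set::default();
    assert!(empty.is_empty());
    assert!(!empty.contains("anything"));
    // Streaming yields no keys, so downstream code needs no special case.
    assert_eq!(empty.stream().into_strs().unwrap(), Vec::<String>::new());
}
```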
@@ -12,7 +12,7 @@ use sdset::{Set, SetBuf, SetOperation};
 use log::debug;
 
 use crate::database::MainT;
-use crate::{store, DocumentId, DocIndex, MResult};
+use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
 use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
 use crate::QueryWordsMapper;
 

@@ -112,9 +112,9 @@ pub struct PostingsList {
     matches: SetBuf<DocIndex>,
 }
 
-pub struct Context {
-    pub words_set: fst::Set,
-    pub stop_words: fst::Set,
+pub struct Context<'a> {
+    pub words_set: FstSetCow<'a>,
+    pub stop_words: FstSetCow<'a>,
     pub synonyms: store::Synonyms,
     pub postings_lists: store::PostingsLists,
     pub prefix_postings_lists: store::PrefixPostingsListsCache,

@@ -147,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
 
 fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
     let words = normalize_str(&words.join(" "));
-    let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
+    let set = ctx.synonyms.synonyms(reader, words.as_bytes())?;
 
     let mut strings = Vec::new();
     let mut stream = set.stream();
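
`Context` growing a lifetime parameter is the visible cost of zero-copy reads: a struct holding transaction-borrowed sets must say how long they live. In reduced form:

```rust
use std::borrow::Cow;

type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;

// Reduced form of the change: the struct borrows its sets from the read
// transaction, so it carries that lifetime and cannot outlive the reader.
pub struct Context<'a> {
    pub words_set: FstSetCow<'a>,
    pub stop_words: FstSetCow<'a>,
}
```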
@@ -1,34 +1,37 @@
+use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
 
-use crate::{DocIndex, DocumentId};
 use deunicode::deunicode_with_tofu;
 use meilisearch_schema::IndexedPos;
 use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
 use sdset::SetBuf;
 
+use crate::{DocIndex, DocumentId};
+use crate::FstSetCow;
+
 const WORD_LENGTH_LIMIT: usize = 80;
 
 type Word = Vec<u8>; // TODO make it be a SmallVec
 
-pub struct RawIndexer {
+pub struct RawIndexer<A> {
     word_limit: usize, // the maximum number of indexed words
-    stop_words: fst::Set,
+    stop_words: fst::Set<A>,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
 
-pub struct Indexed {
+pub struct Indexed<'a> {
     pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
-    pub docs_words: HashMap<DocumentId, fst::Set>,
+    pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
 }
 
-impl RawIndexer {
-    pub fn new(stop_words: fst::Set) -> RawIndexer {
+impl<A> RawIndexer<A> {
+    pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
         RawIndexer::with_word_limit(stop_words, 1000)
     }
 
-    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
         RawIndexer {
             word_limit: limit,
             stop_words,
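
Making `RawIndexer` generic over `A` lets one indexer type hold stop words with any byte backing: owned bytes detached from a transaction, or borrowed ones. A reduced sketch:

```rust
use std::borrow::Cow;

// Stand-in for the real struct: only the stop-words field matters here.
struct RawIndexer<A> {
    stop_words: fst::Set<A>,
}

fn demo() -> Result<(), fst::Error> {
    // A = Vec<u8>: owned stop words.
    let owned = RawIndexer { stop_words: fst::Set::from_iter(vec!["a", "the"])? };

    // A = Cow<'_, [u8]>: stop words borrowing bytes that live elsewhere.
    let bytes = owned.stop_words.as_fst().as_bytes().to_vec();
    let _borrowed = RawIndexer {
        stop_words: fst::Set::new(bytes.as_slice())?.map_data(Cow::Borrowed)?,
    };
    Ok(())
}
```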
@@ -36,7 +39,9 @@ impl RawIndexer {
             docs_words: HashMap::new(),
         }
     }
+}
 
+impl<A: AsRef<[u8]>> RawIndexer<A> {
     pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
         let mut number_of_words = 0;
 

@@ -61,9 +66,9 @@ impl RawIndexer {
         number_of_words
     }
 
-    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
+    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
     where
-        I: IntoIterator<Item = &'a str>,
+        I: IntoIterator<Item = &'s str>,
     {
         let iter = iter.into_iter();
         for token in SeqTokenizer::new(iter) {

@@ -83,7 +88,7 @@ impl RawIndexer {
         }
     }
 
-    pub fn build(self) -> Indexed {
+    pub fn build(self) -> Indexed<'static> {
         let words_doc_indexes = self
             .words_doc_indexes
             .into_iter()

@@ -96,7 +101,8 @@ impl RawIndexer {
             .map(|(id, mut words)| {
                 words.sort_unstable();
                 words.dedup();
-                (id, fst::Set::from_iter(words).unwrap())
+                let fst = fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap();
+                (id, fst)
             })
             .collect();
 
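
Two details in the hunks above: the impl block is split so constructors need no bound while the querying methods require `A: AsRef<[u8]>`, and `build()` can promise `Indexed<'static>` because each per-document set is built owned, then wrapped in `Cow::Owned`:

```rust
use std::borrow::Cow;

// The build() detail in isolation: a freshly built Set<Vec<u8>> is re-wrapped
// as Set<Cow<'static, [u8]>>, so the same map type can hold owned and
// borrowed sets.
fn owned_words_set(mut words: Vec<Vec<u8>>) -> fst::Set<Cow<'static, [u8]>> {
    words.sort_unstable();
    words.dedup();
    fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap()
}
```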
@@ -107,15 +113,17 @@ impl RawIndexer {
     }
 }
 
-fn index_token(
+fn index_token<A>(
     token: Token,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
-    stop_words: &fst::Set,
+    stop_words: &fst::Set<A>,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
-) -> bool {
+) -> bool
+where A: AsRef<[u8]>,
+{
     if token.word_index >= word_limit {
         return false;
     }
|
@ -1,9 +1,11 @@
|
|||||||
use super::BEU32;
|
use std::borrow::Cow;
|
||||||
use crate::database::MainT;
|
|
||||||
use crate::DocumentId;
|
|
||||||
use heed::types::{ByteSlice, OwnedType};
|
|
||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
use std::sync::Arc;
|
use heed::types::{ByteSlice, OwnedType};
|
||||||
|
|
||||||
|
use crate::database::MainT;
|
||||||
|
use crate::{DocumentId, FstSetCow};
|
||||||
|
use super::BEU32;
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct DocsWords {
|
pub struct DocsWords {
|
||||||
@ -15,7 +17,7 @@ impl DocsWords {
|
|||||||
self,
|
self,
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
words: &fst::Set,
|
words: &FstSetCow,
|
||||||
) -> ZResult<()> {
|
) -> ZResult<()> {
|
||||||
let document_id = BEU32::new(document_id.0);
|
let document_id = BEU32::new(document_id.0);
|
||||||
let bytes = words.as_fst().as_bytes();
|
let bytes = words.as_fst().as_bytes();
|
||||||
@ -31,20 +33,11 @@ impl DocsWords {
|
|||||||
self.docs_words.clear(writer)
|
self.docs_words.clear(writer)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn doc_words(
|
pub fn doc_words(self, reader: &heed::RoTxn<MainT>, document_id: DocumentId) -> ZResult<FstSetCow> {
|
||||||
self,
|
|
||||||
reader: &heed::RoTxn<MainT>,
|
|
||||||
document_id: DocumentId,
|
|
||||||
) -> ZResult<Option<fst::Set>> {
|
|
||||||
let document_id = BEU32::new(document_id.0);
|
let document_id = BEU32::new(document_id.0);
|
||||||
match self.docs_words.get(reader, &document_id)? {
|
match self.docs_words.get(reader, &document_id)? {
|
||||||
Some(bytes) => {
|
Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
|
||||||
let len = bytes.len();
|
None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
|
||||||
let bytes = Arc::new(bytes.to_owned());
|
|
||||||
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
|
||||||
Ok(Some(fst::Set::from(fst)))
|
|
||||||
}
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
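
`doc_words` is the first store accessor rewritten to the two-arm pattern that repeats through the rest of the commit: validate the stored bytes and borrow them (`Cow::Borrowed`), or fall back to an owned empty set (`Cow::Owned`). A self-contained sketch with a plain buffer standing in for the LMDB database:

```rust
use std::borrow::Cow;

type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;

// Writing stores nothing but the fst's raw bytes...
fn put(storage: &mut Vec<u8>, words: &fst::Set<impl AsRef<[u8]>>) {
    storage.clear();
    storage.extend_from_slice(words.as_fst().as_bytes());
}

// ...and reading re-validates them and borrows in place. fst::Set::new checks
// the bytes, so a corrupt value fails loudly instead of misbehaving.
fn get(storage: &[u8]) -> Result<FstSetCow<'_>, fst::Error> {
    if storage.is_empty() {
        fst::Set::default().map_data(Cow::Owned)
    } else {
        fst::Set::new(storage)?.map_data(Cow::Borrowed)
    }
}
```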
@@ -1,5 +1,4 @@
 use std::borrow::Cow;
-use std::sync::Arc;
 use std::collections::HashMap;
 
 use chrono::{DateTime, Utc};

@@ -12,6 +11,7 @@ use sdset::Set;
 use crate::database::MainT;
 use crate::RankedMap;
 use crate::settings::RankingRule;
+use crate::{FstSetCow, FstMapCow};
 use super::{CowSet, DocumentsIds};
 
 const ATTRIBUTES_FOR_FACETING_KEY: &str = "attributes-for-faceting";
@@ -103,11 +103,15 @@ impl Main {
         self.put_internal_docids(writer, &internal_docids)
     }
 
-    pub fn put_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> {
+    pub fn put_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, ids.as_fst().as_bytes())
     }
 
-    pub fn merge_external_docids(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map) -> ZResult<()> {
+    pub fn merge_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         use fst::{Streamer, IntoStreamer};
 
         // Do an union of the old and the new set of external docids.

@@ -117,13 +121,15 @@ impl Main {
         while let Some((docid, values)) = op.next() {
             build.insert(docid, values[0].value).unwrap();
         }
-        let external_docids = build.into_inner().unwrap();
+        drop(op);
 
-        // TODO prefer using self.put_user_ids
-        self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice())
+        let external_docids = build.into_map();
+        self.put_external_docids(writer, &external_docids)
     }
 
-    pub fn remove_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> {
+    pub fn remove_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         use fst::{Streamer, IntoStreamer};
 
         // Do an union of the old and the new set of external docids.

@@ -133,21 +139,16 @@ impl Main {
         while let Some((docid, values)) = op.next() {
             build.insert(docid, values[0].value).unwrap();
         }
-        let external_docids = build.into_inner().unwrap();
+        drop(op);
 
-        // TODO prefer using self.put_external_docids
-        self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice())
+        let external_docids = build.into_map();
+        self.put_external_docids(writer, &external_docids)
     }
 
-    pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<fst::Map> {
+    pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstMapCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, EXTERNAL_DOCIDS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(fst::Map::from(fst))
-            },
-            None => Ok(fst::Map::default()),
+            Some(bytes) => Ok(fst::Map::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Map::default().map_data(Cow::Owned).unwrap()),
         }
     }
 

@@ -156,30 +157,14 @@ impl Main {
         Ok(external_ids.get(external_docid).map(|id| DocumentId(id as u32)))
     }
 
-    pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
+    pub fn put_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> {
         self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, fst.as_fst().as_bytes())
     }
 
-    pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
+    pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
-            Some(bytes) => {
-                let bytes: &'static [u8] = std::mem::transmute(bytes);
-                let set = fst::Set::from_static_slice(bytes).unwrap();
-                Ok(Some(set))
-            },
-            None => Ok(None),
-        }
-    }
-
-    pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
-        match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            },
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 
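
In `merge_external_docids` and `remove_external_docids`, the old map now borrows from the transaction, and the streaming op borrows that map in turn — presumably why the op is now dropped explicitly before the result is written back through `put_external_docids`. The merge loop in isolation, over owned maps:

```rust
use fst::Streamer;

fn merge(old: &fst::Map<Vec<u8>>, new: &fst::Map<Vec<u8>>) -> fst::Map<Vec<u8>> {
    let mut op = old.op().add(new).r#union();
    let mut build = fst::MapBuilder::memory();
    while let Some((key, values)) = op.next() {
        // values[0] takes the value from the first stream containing the key.
        build.insert(key, values[0].value).unwrap();
    }
    drop(op); // in the real code this releases the borrow of the stored map
    // into_map() replaces into_inner() + manual reconstruction from bytes.
    build.into_map()
}
```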
@@ -203,37 +188,27 @@ impl Main {
         self.main.get::<_, Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
     }
 
-    pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
+    pub fn put_synonyms_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> {
         let bytes = fst.as_fst().as_bytes();
         self.main.put::<_, Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
     }
 
-    pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
+    pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, SYNONYMS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 
-    pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
+    pub fn put_stop_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> {
         let bytes = fst.as_fst().as_bytes();
         self.main.put::<_, Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
     }
 
-    pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
+    pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> {
         match self.main.get::<_, Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 
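
The same two-arm rewrite lands in `synonyms_fst` and `stop_words_fst`. Since the pattern now appears several times in this file alone, a follow-up could factor it out — a hypothetical helper, not part of this commit:

```rust
use std::borrow::Cow;

type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;

// Hypothetical: one place for the borrowed-or-empty logic the accessors repeat.
fn fst_set_from_bytes(bytes: Option<&[u8]>) -> Result<FstSetCow<'_>, fst::Error> {
    match bytes {
        Some(bytes) => fst::Set::new(bytes)?.map_data(Cow::Borrowed),
        None => fst::Set::default().map_data(Cow::Owned),
    }
}
```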
@@ -1,7 +1,10 @@
-use heed::types::ByteSlice;
-use crate::database::MainT;
+use std::borrow::Cow;
+
 use heed::Result as ZResult;
-use std::sync::Arc;
+use heed::types::ByteSlice;
+
+use crate::database::MainT;
+use crate::FstSetCow;
 
 #[derive(Copy, Clone)]
 pub struct Synonyms {

@@ -9,12 +12,9 @@ pub struct Synonyms {
 }
 
 impl Synonyms {
-    pub fn put_synonyms(
-        self,
-        writer: &mut heed::RwTxn<MainT>,
-        word: &[u8],
-        synonyms: &fst::Set,
-    ) -> ZResult<()> {
+    pub fn put_synonyms<A>(self, writer: &mut heed::RwTxn<MainT>, word: &[u8], synonyms: &fst::Set<A>) -> ZResult<()>
+    where A: AsRef<[u8]>,
+    {
         let bytes = synonyms.as_fst().as_bytes();
         self.synonyms.put(writer, word, bytes)
     }

@@ -27,15 +27,10 @@ impl Synonyms {
         self.synonyms.clear(writer)
     }
 
-    pub fn synonyms(self, reader: &heed::RoTxn<MainT>, word: &[u8]) -> ZResult<Option<fst::Set>> {
+    pub fn synonyms<'txn>(self, reader: &'txn heed::RoTxn<MainT>, word: &[u8]) -> ZResult<FstSetCow<'txn>> {
         match self.synonyms.get(reader, word)? {
-            Some(bytes) => {
-                let len = bytes.len();
-                let bytes = Arc::new(bytes.to_owned());
-                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
-                Ok(Some(fst::Set::from(fst)))
-            }
-            None => Ok(None),
+            Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()),
+            None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()),
         }
     }
 }
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::{HashMap, BTreeMap};
 
 use fst::{set::OpBuilder, SetBuilder};

@@ -108,17 +109,18 @@ pub fn push_documents_addition<D: serde::Serialize>(
     Ok(last_update_id)
 }
 
-fn index_document(
+fn index_document<A>(
     writer: &mut heed::RwTxn<MainT>,
     documents_fields: DocumentsFields,
     documents_fields_counts: DocumentsFieldsCounts,
     ranked_map: &mut RankedMap,
-    indexer: &mut RawIndexer,
+    indexer: &mut RawIndexer<A>,
     schema: &Schema,
     field_id: FieldId,
     document_id: DocumentId,
     value: &Value,
 ) -> MResult<()>
+where A: AsRef<[u8]>,
 {
     let serialized = serde_json::to_vec(value)?;
     documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
@@ -208,10 +210,7 @@ pub fn apply_addition<'a, 'b>(
         None => RankedMap::default(),
     };
 
-    let stop_words = match index.main.stop_words_fst(writer)? {
-        Some(stop_words) => stop_words,
-        None => fst::Set::default(),
-    };
+    let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
 
     // 3. index the documents fields in the stores
     if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? {

@@ -297,10 +296,10 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
     index.postings_lists.clear(writer)?;
     index.docs_words.clear(writer)?;
 
-    let stop_words = match index.main.stop_words_fst(writer)? {
-        Some(stop_words) => stop_words,
-        None => fst::Set::default(),
-    };
+    let stop_words = index.main
+        .stop_words_fst(writer)?
+        .map_data(Cow::into_owned)
+        .unwrap();
 
     let number_of_inserted_documents = documents_ids_to_reindex.len();
     let mut indexer = RawIndexer::new(stop_words);
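
`apply_addition` and `reindex_all_documents` hand the stop words to an indexer that lives across many calls on the same write transaction, so the borrowed set is detached first: `map_data(Cow::into_owned)` copies the bytes exactly once and yields an owned `Set<Vec<u8>>`. Reduced:

```rust
use std::borrow::Cow;

type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;

// Stand-in for the real indexer: what matters is that it owns its set.
struct RawIndexer<A> {
    stop_words: fst::Set<A>,
}

fn make_indexer(stop_words: FstSetCow<'_>) -> Result<RawIndexer<Vec<u8>>, fst::Error> {
    // Cow::into_owned copies transaction-borrowed bytes once; an already
    // owned (Cow::Owned) set passes through without another copy.
    let stop_words = stop_words.map_data(Cow::into_owned)?;
    Ok(RawIndexer { stop_words })
}
```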
@@ -348,13 +347,15 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
     Ok(())
 }
 
-pub fn write_documents_addition_index(
+pub fn write_documents_addition_index<A>(
     writer: &mut heed::RwTxn<MainT>,
     index: &store::Index,
     ranked_map: &RankedMap,
     number_of_inserted_documents: usize,
-    indexer: RawIndexer,
-) -> MResult<()> {
+    indexer: RawIndexer<A>,
+) -> MResult<()>
+where A: AsRef<[u8]>,
+{
     let indexed = indexer.build();
     let mut delta_words_builder = SetBuilder::memory();
 

@@ -373,33 +374,27 @@ pub fn write_documents_addition_index(
         index.docs_words.put_doc_words(writer, id, &words)?;
     }
 
-    let delta_words = delta_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)
-        .unwrap();
+    let delta_words = delta_words_builder.into_set();
 
-    let words = match index.main.words_fst(writer)? {
-        Some(words) => {
-            let op = OpBuilder::new()
-                .add(words.stream())
-                .add(delta_words.stream())
-                .r#union();
+    let words_fst = index.main.words_fst(writer)?;
+    let words = if !words_fst.is_empty() {
+        let op = OpBuilder::new()
+            .add(words_fst.stream())
+            .add(delta_words.stream())
+            .r#union();
 
-            let mut words_builder = SetBuilder::memory();
-            words_builder.extend_stream(op).unwrap();
-            words_builder
-                .into_inner()
-                .and_then(fst::Set::from_bytes)
-                .unwrap()
-        }
-        None => delta_words,
+        let mut words_builder = SetBuilder::memory();
+        words_builder.extend_stream(op).unwrap();
+        words_builder.into_set()
+    } else {
+        delta_words
     };
 
     index.main.put_words_fst(writer, &words)?;
     index.main.put_ranked_map(writer, ranked_map)?;
     index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
 
-    compute_short_prefixes(writer, index)?;
+    compute_short_prefixes(writer, &words, index)?;
 
     Ok(())
 }
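
The words-fst merge now branches on `is_empty()` instead of `Option`, skipping the union when there was no previous fst. The merge step on its own:

```rust
use fst::{set::OpBuilder, SetBuilder};

fn merge_words<A: AsRef<[u8]>>(
    words_fst: &fst::Set<A>,
    delta_words: fst::Set<Vec<u8>>,
) -> fst::Set<Vec<u8>> {
    // An empty existing fst is the state that used to be None: nothing to merge.
    if words_fst.is_empty() {
        return delta_words;
    }

    let op = OpBuilder::new()
        .add(words_fst.stream())
        .add(delta_words.stream())
        .r#union();

    let mut builder = SetBuilder::memory();
    builder.extend_stream(op).unwrap();
    builder.into_set()
}
```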
@@ -114,7 +114,8 @@ pub fn apply_documents_deletion(
             ranked_map.remove(id, *ranked_attr);
         }
 
-        if let Some(words) = index.docs_words.doc_words(writer, id)? {
+        let words = index.docs_words.doc_words(writer, id)?;
+        if !words.is_empty() {
             let mut stream = words.stream();
             while let Some(word) = stream.next() {
                 let word = word.to_vec();

@@ -157,8 +158,8 @@ pub fn apply_documents_deletion(
     }
 
     let removed_words = fst::Set::from_iter(removed_words).unwrap();
-    let words = match index.main.words_fst(writer)? {
-        Some(words_set) => {
+    let words = {
+        let words_set = index.main.words_fst(writer)?;
         let op = fst::set::OpBuilder::new()
             .add(words_set.stream())
             .add(removed_words.stream())

@@ -166,12 +167,7 @@ pub fn apply_documents_deletion(
 
         let mut words_builder = SetBuilder::memory();
         words_builder.extend_stream(op).unwrap();
-        words_builder
-            .into_inner()
-            .and_then(fst::Set::from_bytes)
-            .unwrap()
-        }
-        None => fst::Set::default(),
+        words_builder.into_set()
     };
 
     index.main.put_words_fst(writer, &words)?;

@@ -182,7 +178,7 @@ pub fn apply_documents_deletion(
     index.main.remove_external_docids(writer, &external_docids)?;
     index.main.remove_internal_docids(writer, &internal_docids)?;
 
-    compute_short_prefixes(writer, index)?;
+    compute_short_prefixes(writer, &words, index)?;
 
     Ok(())
 }
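
Deletion gets the mirror-image treatment: `doc_words` always returns a set now, so the guard becomes an emptiness check before streaming the words out:

```rust
use fst::Streamer;

// Sketch of the new guard: stream a document's words only when there are any.
fn collect_doc_words<A: AsRef<[u8]>>(words: &fst::Set<A>) -> Vec<Vec<u8>> {
    let mut out = Vec::new();
    if !words.is_empty() {
        let mut stream = words.stream();
        while let Some(word) = stream.next() {
            out.push(word.to_vec()); // detach each key from the stream's borrow
        }
    }
    out
}
```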
@@ -6,18 +6,19 @@ use meilisearch_types::DocumentId;
 use ordered_float::OrderedFloat;
 use serde_json::Value;
 
-use crate::Number;
+use crate::{Number, FstMapCow};
 use crate::raw_indexer::RawIndexer;
 use crate::serde::SerializerError;
 use crate::store::DiscoverIds;
 
 /// Returns the number of words indexed or `None` if the type is unindexable.
-pub fn index_value(
-    indexer: &mut RawIndexer,
+pub fn index_value<A>(
+    indexer: &mut RawIndexer<A>,
     document_id: DocumentId,
     indexed_pos: IndexedPos,
     value: &Value,
 ) -> Option<usize>
+where A: AsRef<[u8]>,
 {
     match value {
         Value::Null => None,

@@ -99,7 +100,7 @@ pub fn value_to_number(value: &Value) -> Option<Number> {
 /// the corresponding id or generate a new one, this is the way we produce documents ids.
 pub fn discover_document_id(
     docid: &str,
-    external_docids: &fst::Map,
+    external_docids: &FstMapCow,
     available_docids: &mut DiscoverIds<'_>,
 ) -> Result<DocumentId, SerializerError>
 {

@@ -120,7 +121,7 @@ pub fn discover_document_id(
 pub fn extract_document_id(
     primary_key: &str,
     document: &IndexMap<String, Value>,
-    external_docids: &fst::Map,
+    external_docids: &FstMapCow,
     available_docids: &mut DiscoverIds<'_>,
 ) -> Result<(DocumentId, String), SerializerError>
 {
|
@ -297,13 +297,13 @@ pub fn update_task<'a, 'b>(
|
|||||||
Ok(status)
|
Ok(status)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compute_short_prefixes(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
|
fn compute_short_prefixes<A>(
|
||||||
// retrieve the words fst to compute all those prefixes
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
let words_fst = match index.main.words_fst(writer)? {
|
words_fst: &fst::Set<A>,
|
||||||
Some(fst) => fst,
|
index: &store::Index,
|
||||||
None => return Ok(()),
|
) -> MResult<()>
|
||||||
};
|
where A: AsRef<[u8]>,
|
||||||
|
{
|
||||||
// clear the prefixes
|
// clear the prefixes
|
||||||
let pplc_store = index.prefix_postings_lists_cache;
|
let pplc_store = index.prefix_postings_lists_cache;
|
||||||
pplc_store.clear(writer)?;
|
pplc_store.clear(writer)?;
|
||||||
|
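
`compute_short_prefixes` stops re-reading the words fst: both of its callers have just finished building the merged set, so it is passed in, which saves a store read and removes the silent early return when the key was absent. The signature change in isolation (body elided):

```rust
// Hypothetical reduction: callers pass the set they already hold, with any
// byte backing, instead of the function fetching (and maybe missing) it.
fn compute_short_prefixes<A: AsRef<[u8]>>(words_fst: &fst::Set<A>) -> Result<(), fst::Error> {
    // ... walk words_fst and rebuild the prefix postings-lists cache ...
    let _ = words_fst.len();
    Ok(())
}
```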
@@ -168,7 +168,6 @@ pub fn apply_stop_words_update(
 
     let old_stop_words: BTreeSet<String> = index.main
         .stop_words_fst(writer)?
-        .unwrap_or_default()
         .stream()
         .into_strs()?
         .into_iter()

@@ -186,7 +185,8 @@ pub fn apply_stop_words_update(
         apply_stop_words_deletion(writer, index, deletion)?;
     }
 
-    if let Some(words_fst) = index.main.words_fst(writer)? {
+    let words_fst = index.main.words_fst(writer)?;
+    if !words_fst.is_empty() {
         let stop_words = fst::Set::from_iter(stop_words)?;
         let op = OpBuilder::new()
             .add(&words_fst)

@@ -195,7 +195,7 @@ pub fn apply_stop_words_update(
 
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(op)?;
-        let words_fst = builder.into_inner().and_then(fst::Set::from_bytes)?;
+        let words_fst = builder.into_set();
 
         index.main.put_words_fst(writer, &words_fst)?;
         index.main.put_stop_words_fst(writer, &stop_words)?;

@@ -222,28 +222,25 @@ fn apply_stop_words_addition(
     }
 
     // create the new delta stop words fst
-    let delta_stop_words = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let delta_stop_words = stop_words_builder.into_set();
 
     // we also need to remove all the stop words from the main fst
-    if let Some(word_fst) = main_store.words_fst(writer)? {
+    let words_fst = main_store.words_fst(writer)?;
+    if !words_fst.is_empty() {
         let op = OpBuilder::new()
-            .add(&word_fst)
+            .add(&words_fst)
             .add(&delta_stop_words)
             .difference();
 
         let mut word_fst_builder = SetBuilder::memory();
         word_fst_builder.extend_stream(op)?;
-        let word_fst = word_fst_builder
-            .into_inner()
-            .and_then(fst::Set::from_bytes)?;
+        let word_fst = word_fst_builder.into_set();
 
         main_store.put_words_fst(writer, &word_fst)?;
     }
 
     // now we add all of these stop words from the main store
-    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+    let stop_words_fst = main_store.stop_words_fst(writer)?;
 
     let op = OpBuilder::new()
         .add(&stop_words_fst)

@@ -252,9 +249,7 @@ fn apply_stop_words_addition(
 
     let mut stop_words_builder = SetBuilder::memory();
     stop_words_builder.extend_stream(op)?;
-    let stop_words_fst = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let stop_words_fst = stop_words_builder.into_set();
 
     main_store.put_stop_words_fst(writer, &stop_words_fst)?;
 

@@ -274,12 +269,10 @@ fn apply_stop_words_deletion(
     }
 
     // create the new delta stop words fst
-    let delta_stop_words = stop_words_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let delta_stop_words = stop_words_builder.into_set();
 
     // now we delete all of these stop words from the main store
-    let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
+    let stop_words_fst = index.main.stop_words_fst(writer)?;
 
     let op = OpBuilder::new()
         .add(&stop_words_fst)

@@ -288,7 +281,7 @@ fn apply_stop_words_deletion(
 
     let mut stop_words_builder = SetBuilder::memory();
     stop_words_builder.extend_stream(op)?;
-    let stop_words_fst = stop_words_builder.into_inner().and_then(fst::Set::from_bytes)?;
+    let stop_words_fst = stop_words_builder.into_set();
 
     Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?)
 }
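
The stop-word passes use the other streaming operation: `difference` subtracts the delta stop words from the main words fst before `into_set()` re-serializes the result. Isolated:

```rust
use fst::{set::OpBuilder, SetBuilder};

// Subtract stop from words, streaming; neither input set is copied.
fn remove_stop_words<A, B>(words: &fst::Set<A>, stop: &fst::Set<B>) -> fst::Set<Vec<u8>>
where
    A: AsRef<[u8]>,
    B: AsRef<[u8]>,
{
    let op = OpBuilder::new().add(words).add(stop).difference();

    let mut builder = SetBuilder::memory();
    builder.extend_stream(op).unwrap();
    builder.into_set()
}
```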
@@ -311,16 +304,13 @@ pub fn apply_synonyms_update(
             let alternatives = SetBuf::from_dirty(alternatives);
             let mut alternatives_builder = SetBuilder::memory();
             alternatives_builder.extend_iter(alternatives)?;
-            let bytes = alternatives_builder.into_inner()?;
-            fst::Set::from_bytes(bytes)?
+            alternatives_builder.into_set()
         };
 
         synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
     }
 
-    let synonyms_set = synonyms_builder
-        .into_inner()
-        .and_then(fst::Set::from_bytes)?;
+    let synonyms_set = synonyms_builder.into_set();
 
     main_store.put_synonyms_fst(writer, &synonyms_set)?;
 
|
@ -63,21 +63,19 @@ async fn get_all(
|
|||||||
let reader = data.db.main_read_txn()?;
|
let reader = data.db.main_read_txn()?;
|
||||||
|
|
||||||
let stop_words_fst = index.main.stop_words_fst(&reader)?;
|
let stop_words_fst = index.main.stop_words_fst(&reader)?;
|
||||||
let stop_words = stop_words_fst.unwrap_or_default().stream().into_strs()?;
|
let stop_words = stop_words_fst.stream().into_strs()?;
|
||||||
let stop_words: BTreeSet<String> = stop_words.into_iter().collect();
|
let stop_words: BTreeSet<String> = stop_words.into_iter().collect();
|
||||||
|
|
||||||
let synonyms_fst = index.main.synonyms_fst(&reader)?.unwrap_or_default();
|
let synonyms_fst = index.main.synonyms_fst(&reader)?;
|
||||||
let synonyms_list = synonyms_fst.stream().into_strs()?;
|
let synonyms_list = synonyms_fst.stream().into_strs()?;
|
||||||
|
|
||||||
let mut synonyms = BTreeMap::new();
|
let mut synonyms = BTreeMap::new();
|
||||||
let index_synonyms = &index.synonyms;
|
let index_synonyms = &index.synonyms;
|
||||||
for synonym in synonyms_list {
|
for synonym in synonyms_list {
|
||||||
let alternative_list = index_synonyms.synonyms(&reader, synonym.as_bytes())?;
|
let alternative_list = index_synonyms.synonyms(&reader, synonym.as_bytes())?;
|
||||||
if let Some(list) = alternative_list {
|
let list = alternative_list.stream().into_strs()?;
|
||||||
let list = list.stream().into_strs()?;
|
|
||||||
synonyms.insert(synonym, list);
|
synonyms.insert(synonym, list);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
let ranking_rules = index
|
let ranking_rules = index
|
||||||
.main
|
.main
|
||||||
|
@ -26,7 +26,7 @@ async fn get(
|
|||||||
.ok_or(ResponseError::index_not_found(&path.index_uid))?;
|
.ok_or(ResponseError::index_not_found(&path.index_uid))?;
|
||||||
let reader = data.db.main_read_txn()?;
|
let reader = data.db.main_read_txn()?;
|
||||||
let stop_words_fst = index.main.stop_words_fst(&reader)?;
|
let stop_words_fst = index.main.stop_words_fst(&reader)?;
|
||||||
let stop_words = stop_words_fst.unwrap_or_default().stream().into_strs()?;
|
let stop_words = stop_words_fst.stream().into_strs()?;
|
||||||
|
|
||||||
Ok(HttpResponse::Ok().json(stop_words))
|
Ok(HttpResponse::Ok().json(stop_words))
|
||||||
}
|
}
|
||||||
|
@ -29,19 +29,16 @@ async fn get(
|
|||||||
|
|
||||||
let reader = data.db.main_read_txn()?;
|
let reader = data.db.main_read_txn()?;
|
||||||
|
|
||||||
let synonyms_fst = index.main.synonyms_fst(&reader)?.unwrap_or_default();
|
let synonyms_fst = index.main.synonyms_fst(&reader)?;
|
||||||
let synonyms_list = synonyms_fst.stream().into_strs()?;
|
let synonyms_list = synonyms_fst.stream().into_strs()?;
|
||||||
|
|
||||||
let mut synonyms = IndexMap::new();
|
let mut synonyms = IndexMap::new();
|
||||||
let index_synonyms = &index.synonyms;
|
let index_synonyms = &index.synonyms;
|
||||||
for synonym in synonyms_list {
|
for synonym in synonyms_list {
|
||||||
let alternative_list = index_synonyms.synonyms(&reader, synonym.as_bytes())?;
|
let alternative_list = index_synonyms.synonyms(&reader, synonym.as_bytes())?;
|
||||||
|
let list = alternative_list.stream().into_strs()?;
|
||||||
if let Some(list) = alternative_list {
|
|
||||||
let list = list.stream().into_strs()?;
|
|
||||||
synonyms.insert(synonym, list);
|
synonyms.insert(synonym, list);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
Ok(HttpResponse::Ok().json(synonyms))
|
Ok(HttpResponse::Ok().json(synonyms))
|
||||||
}
|
}
|
||||||
|