Implement the facet search with the normalized index

This commit is contained in:
Kerollmops 2023-07-24 17:52:08 +02:00
parent df528b41d8
commit 691a536893
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -1,5 +1,8 @@
use std::fmt; use std::fmt;
use std::ops::ControlFlow;
use charabia::normalizer::NormalizerOption;
use charabia::Normalize;
use fst::automaton::{Automaton, Str}; use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
@ -14,8 +17,8 @@ use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::{ use crate::{
execute_search, normalize_facet, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result,
Result, SearchContext, BEU16, SearchContext, BEU16,
}; };
// Building these factories is not free. // Building these factories is not free.
@ -301,29 +304,28 @@ impl<'a> SearchForFacetValues<'a> {
match self.query.as_ref() { match self.query.as_ref() {
Some(query) => { Some(query) => {
let query = normalize_facet(query); let options = NormalizerOption { lossy: true, ..Default::default() };
let query = query.as_str(); let query = query.normalize(&options);
let query = query.as_ref();
let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
let field_authorizes_typos = let field_authorizes_typos =
!self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid); !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);
if authorize_typos && field_authorizes_typos { if authorize_typos && field_authorizes_typos {
let mut results = vec![];
let exact_words_fst = self.search_query.index.exact_words(rtxn)?; let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
if exact_words_fst.map_or(false, |fst| fst.contains(query)) { if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: query }; let mut results = vec![];
if let Some(FacetGroupValue { bitmap, .. }) = if fst.contains(query) {
index.facet_id_string_docids.get(rtxn, &key)? self.fetch_original_facets_using_normalized(
{ fid,
let count = search_candidates.intersection_len(&bitmap); query,
if count != 0 { query,
let value = self &search_candidates,
.one_original_value_of(fid, query, bitmap.min().unwrap())? &mut results,
.unwrap_or_else(|| query.to_string()); )?;
results.push(FacetValueHit { value, count });
}
} }
Ok(results)
} else { } else {
let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
@ -338,81 +340,41 @@ impl<'a> SearchForFacetValues<'a> {
}; };
let mut stream = fst.search(automaton).into_stream(); let mut stream = fst.search(automaton).into_stream();
let mut length = 0; let mut results = vec![];
'outer: while let Some(facet_value) = stream.next() { while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?; let value = std::str::from_utf8(facet_value)?;
let database = index.facet_id_normalized_string_strings; if self
let key = (fid, value); .fetch_original_facets_using_normalized(
let original_strings = match database.get(rtxn, &key)? { fid,
Some(original_strings) => original_strings, value,
None => { query,
error!( &search_candidates,
"the facet value is missing from the facet database: {key:?}" &mut results,
); )?
continue; .is_break()
} {
}; break;
for original_string in original_strings {
let key = FacetGroupKey {
field_id: fid,
level: 0,
left_bound: original_string.as_str(),
};
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!(
"the facet value is missing from the facet database: {key:?}"
);
continue;
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(
fid,
&original_string,
docids.min().unwrap(),
)?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break 'outer;
}
} }
} }
}
Ok(results) Ok(results)
}
} else { } else {
let automaton = Str::new(query).starts_with(); let automaton = Str::new(query).starts_with();
let mut stream = fst.search(automaton).into_stream(); let mut stream = fst.search(automaton).into_stream();
let mut results = vec![]; let mut results = vec![];
let mut length = 0;
while let Some(facet_value) = stream.next() { while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?; let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value }; if self
let docids = match index.facet_id_string_docids.get(rtxn, &key)? { .fetch_original_facets_using_normalized(
Some(FacetGroupValue { bitmap, .. }) => bitmap, fid,
None => { value,
error!( query,
"the facet value is missing from the facet database: {key:?}" &search_candidates,
); &mut results,
continue; )?
} .is_break()
}; {
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, value, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break; break;
} }
} }
@ -422,7 +384,6 @@ impl<'a> SearchForFacetValues<'a> {
} }
None => { None => {
let mut results = vec![]; let mut results = vec![];
let mut length = 0;
let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
@ -433,9 +394,8 @@ impl<'a> SearchForFacetValues<'a> {
.one_original_value_of(fid, left_bound, bitmap.min().unwrap())? .one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
.unwrap_or_else(|| left_bound.to_string()); .unwrap_or_else(|| left_bound.to_string());
results.push(FacetValueHit { value, count }); results.push(FacetValueHit { value, count });
length += 1;
} }
if length >= MAX_NUMBER_OF_FACETS { if results.len() >= MAX_NUMBER_OF_FACETS {
break; break;
} }
} }
@ -443,6 +403,50 @@ impl<'a> SearchForFacetValues<'a> {
} }
} }
} }
/// Resolves one normalized facet `value` into its original facet strings and
/// appends a [`FacetValueHit`] to `results` for each original string that has
/// at least one document in `search_candidates`.
///
/// - `fid`: the field id the facet value belongs to.
/// - `value`: the normalized facet string used as the lookup key in
///   `facet_id_normalized_string_strings`.
/// - `query`: the text used as the hit value when `one_original_value_of`
///   returns `None`.
/// - `search_candidates`: the documents the counts are computed against.
/// - `results`: the accumulator shared across calls; hits are pushed onto it.
///
/// Returns `ControlFlow::Break(())` as soon as `results` holds at least
/// `MAX_NUMBER_OF_FACETS` entries, so the caller can stop streaming values,
/// and `ControlFlow::Continue(())` otherwise.
///
/// # Errors
///
/// Propagates any error returned by the database lookups or by
/// `one_original_value_of`.
fn fetch_original_facets_using_normalized(
    &self,
    fid: FieldId,
    value: &str,
    query: &str,
    search_candidates: &RoaringBitmap,
    results: &mut Vec<FacetValueHit>,
) -> Result<ControlFlow<()>> {
    let index = self.search_query.index;
    let rtxn = self.search_query.rtxn;

    let database = index.facet_id_normalized_string_strings;
    let key = (fid, value);
    let original_strings = match database.get(rtxn, &key)? {
        Some(original_strings) => original_strings,
        None => {
            // Nothing is registered for this normalized value: log it and let
            // the caller move on to the next one.
            error!("the facet value is missing from the facet database: {key:?}");
            return Ok(ControlFlow::Continue(()));
        }
    };

    for original in original_strings {
        let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
        let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
            Some(FacetGroupValue { bitmap, .. }) => bitmap,
            None => {
                error!("the facet value is missing from the facet database: {key:?}");
                // Skip only this original string. Returning here would drop
                // every remaining original string of the same normalized value.
                continue;
            }
        };

        let count = search_candidates.intersection_len(&docids);
        if count != 0 {
            // A non-zero intersection means `docids` is not empty, so `min()`
            // cannot be `None` here.
            // NOTE(review): the fallback is the user's query rather than
            // `original` — confirm this is the intended display value.
            let value = self
                .one_original_value_of(fid, &original, docids.min().unwrap())?
                .unwrap_or_else(|| query.to_string());
            results.push(FacetValueHit { value, count });
        }

        if results.len() >= MAX_NUMBER_OF_FACETS {
            return Ok(ControlFlow::Break(()));
        }
    }

    Ok(ControlFlow::Continue(()))
}
} }
#[derive(Debug, Clone, serde::Serialize, PartialEq)] #[derive(Debug, Clone, serde::Serialize, PartialEq)]