From e39aabbfe6e35b40261910d09c9b5b13cc2dfaa5 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 7 Apr 2021 11:53:57 +0300 Subject: [PATCH] feat(search, update): synonyms --- http-ui/src/main.rs | 12 ++++++ milli/src/index.rs | 52 +++++++++++++++++++----- milli/src/search/query_tree.rs | 21 +++++----- milli/src/update/settings.rs | 73 ++++++++++++++++++++++++++++++++-- 4 files changed, 132 insertions(+), 26 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 08e28be56..605b6a7ba 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -260,6 +260,9 @@ struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] stop_words: Setting>, + + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + synonyms: Setting>>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -431,6 +434,13 @@ async fn main() -> anyhow::Result<()> { Setting::NotSet => () } + // We transpose the settings JSON struct into a real setting update. + match settings.synonyms { + Setting::Set(synonyms) => builder.set_synonyms(synonyms), + Setting::Reset => builder.reset_synonyms(), + Setting::NotSet => () + } + let result = builder.execute(|indexing_step, update_id| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), @@ -1011,6 +1021,7 @@ mod tests { faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), + synonyms: Setting::NotSet }; assert_tokens(&settings, &[ @@ -1053,6 +1064,7 @@ mod tests { faceted_attributes: Setting::Reset, criteria: Setting::Reset, stop_words: Setting::Reset, + synonyms: Setting::NotSet }; assert_tokens(&settings, &[ diff --git a/milli/src/index.rs b/milli/src/index.rs index 7be618789..d743445e3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -3,19 +3,19 @@ use std::collections::HashMap; use std::path::Path; use anyhow::Context; +use chrono::{DateTime, Utc}; +use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; -use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; -use chrono::{Utc, DateTime}; +use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; +use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; +use crate::{ + BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, + ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, +}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution}; -use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds}; -use crate::{ - RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec, - StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, -}; pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; @@ -31,6 +31,7 @@ pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; pub const STOP_WORDS_KEY: &str = "stop-words"; +pub const SYNONYMS_KEY: &str = "synonyms"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; @@ -376,12 +377,12 @@ impl Index { /* words fst */ - /// Writes the FST which is the words dictionnary of the engine. + /// Writes the FST which is the words dictionary of the engine. pub fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes()) } - /// Returns the FST which is the words dictionnary of the engine. + /// Returns the FST which is the words dictionary of the engine. pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), @@ -398,6 +399,7 @@ impl Index { pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY) } + pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? { Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), @@ -405,6 +407,34 @@ impl Index { } } + /* synonyms */ + + pub fn put_synonyms(&self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<_>>(wtxn, SYNONYMS_KEY, synonyms) + } + + pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY) + } + + pub fn synonyms(&self, rtxn: &RoTxn) -> anyhow::Result, Vec>>>> { + match self.main.get::<_, Str, SerdeBincode, Vec>>>>(rtxn, SYNONYMS_KEY)? { + Some(synonyms) => Ok(Some(synonyms)), + None => Ok(None), + } + } + + pub fn words_synonyms>(&self, rtxn: &RoTxn, words: &[S]) -> anyhow::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect(); + + match self.synonyms(rtxn)? { + Some(synonyms) => Ok(Some( + synonyms.get(&words).cloned().unwrap_or(Vec::default()) + )), + None => Ok(None) + } + } + /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. @@ -536,7 +566,7 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let fields_distribution = index.fields_distribution(&rtxn).unwrap(); - assert_eq!(fields_distribution, hashmap!{ + assert_eq!(fields_distribution, hashmap! { "name".to_string() => 2, "age".to_string() => 1, }); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 1941f0c6f..b2fd62771 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,7 +155,7 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; - fn synonyms>(&self, words: &[S]) -> heed::Result>>>; + fn synonyms>(&self, words: &[S]) -> anyhow::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { Some(rb) => Ok(Some(rb.len())), @@ -177,12 +177,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } - fn word_documents_count(&self, word: &str) -> heed::Result> { - self.index.word_documents_count(self.rtxn, word) + fn synonyms>(&self, words: &[S]) -> anyhow::Result>>> { + self.index.words_synonyms(self.rtxn, words) } - fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { - Ok(None) + fn word_documents_count(&self, word: &str) -> heed::Result> { + self.index.word_documents_count(self.rtxn, word) } } @@ -270,10 +270,10 @@ fn typos(word: String, authorize_typos: bool) -> QueryKind { } } -/// Fetch synonyms from the `Context` for the provided word +/// Fetch synonyms from the `Context` for the provided words /// and create the list of operations for the query tree -fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { - let synonyms = ctx.synonyms(word)?; +fn synonyms(ctx: &impl Context, words: &[&str]) -> anyhow::Result>> { + let synonyms = ctx.synonyms(words)?; Ok(synonyms.map(|synonyms| { synonyms.into_iter().map(|synonym| { @@ -581,14 +581,13 @@ mod test { Ok(self.postings.get(word).cloned()) } - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { - let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); + fn synonyms>(&self, words: &[S]) -> anyhow::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect(); Ok(self.synonyms.get(&words).cloned()) } } impl Default for TestContext { - fn default() -> TestContext { let mut rng = StdRng::seed_from_u64(102); let rng = &mut rng; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index e63948082..336c0e253 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -13,6 +13,7 @@ use crate::criterion::Criterion; use crate::facet::FacetType; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; +use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; #[derive(Debug, Clone, PartialEq)] pub enum Setting { @@ -71,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> { criteria: Setting>, stop_words: Setting>, distinct_attribute: Setting, + synonyms: Setting>>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -96,6 +98,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_attribute: Setting::NotSet, + synonyms: Setting::NotSet, update_id, } } @@ -144,12 +147,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_distinct_attribute(&mut self) { + self.distinct_attribute = Setting::Reset; + } + pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { self.distinct_attribute = Setting::Set(distinct_attribute); } - pub fn reset_distinct_attribute(&mut self) { - self.distinct_attribute = Setting::Reset; + pub fn reset_synonyms(&mut self) { + self.synonyms = Setting::Reset; + } + + pub fn set_synonyms(&mut self, synonyms: HashMap>) { + self.synonyms = if synonyms.is_empty() { + Setting::Reset + } else { + Setting::Set(synonyms) + } } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> @@ -294,7 +309,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let current = self.index.stop_words(self.wtxn)?; // since we can't compare a BTreeSet with an FST we are going to convert the // BTreeSet to an FST and then compare bytes per bytes the two FSTs. - let fst = fst::Set::from_iter(&*stop_words)?; + let fst = fst::Set::from_iter(stop_words)?; // Does the new FST differ from the previous one? if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { @@ -310,6 +325,55 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_synonyms(&mut self) -> anyhow::Result { + match self.synonyms { + Setting::Set(ref synonyms) => { + let old_synonyms = self.index.synonyms(self.wtxn)?.unwrap_or_default(); + + let mut config = AnalyzerConfig::default(); + + let stop_words = self.index.stop_words(self.wtxn)?; + if let Some(stop_words) = &stop_words { + config.stop_words(stop_words); + } + + let analyzer = Analyzer::new(config); + + let normalize = |text: &String| { + analyzer + .analyze(text) + .tokens() + .filter_map(|token| + if token.is_word() { Some(token.text().to_string()) } else { None } + ) + .collect::>() + }; + + let new_synonyms = synonyms + .iter() + .map(|(word, synonyms)| { + let normalized_word = normalize(word); + let normalized_synonyms = synonyms.iter() + .map(normalize) + .unique() + .collect::>(); + + (normalized_word, normalized_synonyms) + }) + .collect(); + + if new_synonyms != old_synonyms { + self.index.put_synonyms(self.wtxn, &new_synonyms)?; + Ok(true) + } else { + Ok(false) + } + } + Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?), + Setting::NotSet => Ok(false), + } + } + fn update_facets(&mut self) -> anyhow::Result { match self.faceted_fields { Setting::Set(ref fields) => { @@ -359,9 +423,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // update_criteria MUST be called after update_facets, since criterion fields must be set // as facets. self.update_criteria()?; + let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; - if facets_updated || searchable_updated || stop_words_updated { + if stop_words_updated || facets_updated || synonyms_updated || searchable_updated { self.reindex(&progress_callback, old_fields_ids_map)?; } Ok(())