From 3a2451fcbad9ab29dae49e281d56210d06bf9536 Mon Sep 17 00:00:00 2001
From: ad hoc
Date: Thu, 21 Apr 2022 13:52:09 +0200
Subject: [PATCH 1/2] add test normalize exact words

---
Review note: the test now compares the whole stored set instead of looping
over it (the loop passed vacuously on an empty set), and its setup comment
names the setting that is actually being changed. The blob ids on the
"index" lines of this series predate these tweaks.

 milli/src/update/settings.rs | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index ff59249b7..3c0c0fbee 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1461,4 +1461,27 @@ mod tests {
         builder.set_min_word_len_two_typos(7);
         assert!(builder.execute(|_| ()).is_err());
     }
+
+    /// Exact words must be stored in their normalized form: `Ab` is expected to
+    /// come back from the index as `ab`.
+    #[test]
+    fn update_exact_words_normalization() {
+        let index = TempIndex::new();
+        let config = IndexerConfig::default();
+
+        // Set the exact words setting
+        let mut txn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut txn, &index, &config);
+
+        let words = btreeset! { S("Ab"), S("ac") };
+        builder.set_exact_words(words);
+        assert!(builder.execute(|_| ()).is_ok());
+
+        // Compare the whole stored set: a per-word loop would pass vacuously
+        // if nothing had been stored at all.
+        let exact_words = index.exact_words(&txn).unwrap();
+        let stored = exact_words.into_fst().stream().into_str_vec().unwrap();
+        let stored: Vec<&str> = stored.iter().map(|(word, _)| word.as_str()).collect();
+        assert_eq!(stored, ["ab", "ac"]);
+    }
 }

From 2e0089d5ff65adad7351b821b0bcb7eb7004479e Mon Sep 17 00:00:00 2001
From: ad hoc
Date: Thu, 21 Apr 2022 14:09:33 +0200
Subject: [PATCH 2/2] normalize exact words

---
 milli/src/update/settings.rs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 3c0c0fbee..d49915787 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -580,6 +580,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
+                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String {
+                    analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect()
+                }
+
+                let mut config = AnalyzerConfig::default();
+                let stop_words = self.index.stop_words(self.wtxn)?;
+                if let Some(stop_words) = &stop_words {
+                    config.stop_words(stop_words);
+                }
+                let analyzer = Analyzer::new(config);
+
+                let mut words: Vec<_> =
+                    words.iter().map(|word| normalize(&analyzer, word)).collect();
+
+                // normalization could reorder words
+                words.sort_unstable();
+
                 let words = fst::Set::from_iter(words.iter())?;
                 self.index.put_exact_words(&mut self.wtxn, &words)?;
             }