typos on the first letter count as 2

This commit is contained in:
mpostma 2022-01-20 18:35:11 +01:00 committed by ad hoc
parent 642c01d0dc
commit 55e6cb9c7b
No known key found for this signature in database
GPG Key ID: 4F00A782990CC643
2 changed files with 32 additions and 8 deletions

View File

@ -18,3 +18,6 @@ opt-level = 3
opt-level = 3
[profile.test.build-override]
opt-level = 3
[patch.crates-io]
fst = { path = "/Users/mpostma/Documents/code/rust/fst/" }

View File

@ -7,7 +7,8 @@ use std::str::Utf8Error;
use std::time::Instant;
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use fst::{IntoStreamer, Streamer};
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::debug;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
@ -285,6 +286,18 @@ pub fn word_derivations<'c>(
Entry::Vacant(entry) => {
let mut derived_words = Vec::new();
let dfa = build_dfa(word, max_typo, is_prefix);
if max_typo == 1 {
let starts = Str::new(get_first(word));
let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream();
while let Some((word, state)) = stream.next() {
let word = std::str::from_utf8(word)?;
let distance = dfa.distance(state.1);
derived_words.push((word.to_string(), distance.to_u8()));
}
Ok(entry.insert(derived_words))
} else {
let mut stream = fst.search_with_state(&dfa).into_stream();
while let Some((word, state)) = stream.next() {
@ -296,6 +309,14 @@ pub fn word_derivations<'c>(
Ok(entry.insert(derived_words))
}
}
}
}
/// Returns the leading slice of `s` that holds exactly its first character.
///
/// The slice is cut at the UTF-8 byte length of that character, so a
/// multi-byte first character is kept whole. An empty string yields an
/// empty slice.
fn get_first(s: &str) -> &str {
    // Zero bytes when there is no first character, otherwise its encoded width.
    let first_len = s.chars().next().map_or(0, char::len_utf8);
    &s[..first_len]
}
pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {