From 3f24555c3d16b3078ef0182980341e2fbdc3ea43 Mon Sep 17 00:00:00 2001
From: ad hoc
Date: Tue, 15 Mar 2022 17:28:57 +0100
Subject: [PATCH] custom fst automatons

---
 Cargo.toml                    |   3 -
 milli/src/search/fst_utils.rs | 187 ++++++++++++++++++++++++++++++++++
 milli/src/search/mod.rs       |  16 +--
 3 files changed, 196 insertions(+), 10 deletions(-)
 create mode 100644 milli/src/search/fst_utils.rs

diff --git a/Cargo.toml b/Cargo.toml
index 52599b1bd..6b3e12f07 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,3 @@ opt-level = 3
 opt-level = 3
 [profile.test.build-override]
 opt-level = 3
-
-[patch.crates-io]
-fst = { git = "https://github.com/MarinPostma/fst.git", rev = "e6c606b7507e8cb5e502d1609f9b909b8690bac5" }
diff --git a/milli/src/search/fst_utils.rs b/milli/src/search/fst_utils.rs
new file mode 100644
index 000000000..b488e6c19
--- /dev/null
+++ b/milli/src/search/fst_utils.rs
@@ -0,0 +1,187 @@
+/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
+/// All credits for this code go to BurntSushi.
+use fst::Automaton;
+
+pub struct StartsWith<A>(pub A);
+
+/// The `Automaton` state for `StartsWith`.
+pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);
+
+impl<A: Automaton> Clone for StartsWithState<A>
+where
+    A::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
+
+/// The inner state of a `StartsWithState`.
+pub enum StartsWithStateKind<A: Automaton> {
+    /// Sink state that is reached when the automaton has matched the prefix.
+    Done,
+    /// State in which the automaton is while it hasn't matched the prefix.
+    Running(A::State),
+}
+
+impl<A: Automaton> Clone for StartsWithStateKind<A>
+where
+    A::State: Clone,
+{
+    fn clone(&self) -> Self {
+        match self {
+            StartsWithStateKind::Done => StartsWithStateKind::Done,
+            StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
+        }
+    }
+}
+
+impl<A: Automaton> Automaton for StartsWith<A> {
+    type State = StartsWithState<A>;
+
+    fn start(&self) -> StartsWithState<A> {
+        StartsWithState({
+            let inner = self.0.start();
+            if self.0.is_match(&inner) {
+                StartsWithStateKind::Done
+            } else {
+                StartsWithStateKind::Running(inner)
+            }
+        })
+    }
+    fn is_match(&self, state: &StartsWithState<A>) -> bool {
+        match state.0 {
+            StartsWithStateKind::Done => true,
+            StartsWithStateKind::Running(_) => false,
+        }
+    }
+    fn can_match(&self, state: &StartsWithState<A>) -> bool {
+        match state.0 {
+            StartsWithStateKind::Done => true,
+            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
+        }
+    }
+    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
+        match state.0 {
+            StartsWithStateKind::Done => true,
+            StartsWithStateKind::Running(_) => false,
+        }
+    }
+    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
+        StartsWithState(match state.0 {
+            StartsWithStateKind::Done => StartsWithStateKind::Done,
+            StartsWithStateKind::Running(ref inner) => {
+                let next_inner = self.0.accept(inner, byte);
+                if self.0.is_match(&next_inner) {
+                    StartsWithStateKind::Done
+                } else {
+                    StartsWithStateKind::Running(next_inner)
+                }
+            }
+        })
+    }
+}
+/// An automaton that matches when one of its component automata match.
+#[derive(Clone, Debug)]
+pub struct Union<A, B>(pub A, pub B);
+
+/// The `Automaton` state for `Union`.
+pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
+
+impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
+where
+    A::State: Clone,
+    B::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone(), self.1.clone())
+    }
+}
+
+impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
+    type State = UnionState<A, B>;
+    fn start(&self) -> UnionState<A, B> {
+        UnionState(self.0.start(), self.1.start())
+    }
+    fn is_match(&self, state: &UnionState<A, B>) -> bool {
+        self.0.is_match(&state.0) || self.1.is_match(&state.1)
+    }
+    fn can_match(&self, state: &UnionState<A, B>) -> bool {
+        self.0.can_match(&state.0) || self.1.can_match(&state.1)
+    }
+    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
+        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
+    }
+    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
+        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
+    }
+}
+/// An automaton that matches when both of its component automata match.
+#[derive(Clone, Debug)]
+pub struct Intersection<A, B>(pub A, pub B);
+
+/// The `Automaton` state for `Intersection`.
+pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
+
+impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
+where
+    A::State: Clone,
+    B::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone(), self.1.clone())
+    }
+}
+
+impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
+    type State = IntersectionState<A, B>;
+    fn start(&self) -> IntersectionState<A, B> {
+        IntersectionState(self.0.start(), self.1.start())
+    }
+    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
+        self.0.is_match(&state.0) && self.1.is_match(&state.1)
+    }
+    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
+        self.0.can_match(&state.0) && self.1.can_match(&state.1)
+    }
+    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
+        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
+    }
+    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
+        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
+    }
+}
+/// An automaton that matches exactly when the automaton it wraps does not.
+#[derive(Clone, Debug)]
+pub struct Complement<A>(pub A);
+
+/// The `Automaton` state for `Complement`.
+pub struct ComplementState<A: Automaton>(pub A::State);
+
+impl<A: Automaton> Clone for ComplementState<A>
+where
+    A::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
+
+impl<A: Automaton> Automaton for Complement<A> {
+    type State = ComplementState<A>;
+    fn start(&self) -> ComplementState<A> {
+        ComplementState(self.0.start())
+    }
+    fn is_match(&self, state: &ComplementState<A>) -> bool {
+        !self.0.is_match(&state.0)
+    }
+    fn can_match(&self, state: &ComplementState<A>) -> bool {
+        !self.0.will_always_match(&state.0)
+    }
+    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
+        !self.0.can_match(&state.0)
+    }
+    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
+        ComplementState(self.0.accept(&state.0, byte))
+    }
+}
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index bfe5e023c..40e4bca24 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -16,6 +16,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
+use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matching_words::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 use crate::error::UserError;
@@ -30,6 +31,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
 mod criteria;
 mod distinct;
 mod facet;
+mod fst_utils;
 mod matching_words;
 mod query_tree;
 
@@ -70,7 +72,6 @@ impl<'a> Search<'a> {
 
     pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
         self.offset = offset;
-
         self
     }
 
@@ -301,8 +302,9 @@ pub fn word_derivations<'c>(
             } else {
                 if max_typo == 1 {
                     let dfa = build_dfa(word, 1, is_prefix);
-                    let starts = Str::new(get_first(word)).starts_with();
-                    let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream();
+                    let starts = StartsWith(Str::new(get_first(word)));
+                    let mut stream =
+                        fst.search_with_state(Intersection(starts, &dfa)).into_stream();
 
                     while let Some((word, state)) = stream.next() {
                         let word = std::str::from_utf8(word)?;
@@ -310,11 +312,11 @@ pub fn word_derivations<'c>(
                         derived_words.push((word.to_string(), d.to_u8()));
                     }
                 } else {
-                    let starts = Str::new(get_first(word)).starts_with();
-                    let first = build_dfa(word, 1, is_prefix).intersection((&starts).complement());
+                    let starts = StartsWith(Str::new(get_first(word)));
+                    let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
                     let second_dfa = build_dfa(word, 2, is_prefix);
-                    let second = (&second_dfa).intersection(&starts);
-                    let automaton = first.union(&second);
+                    let second = Intersection(&second_dfa, &starts);
+                    let automaton = Union(first, &second);
 
                     let mut stream = fst.search_with_state(automaton).into_stream();