diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d0690e319..0dae15f75 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,13 +13,17 @@ jobs: steps: - script: | curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly - displayName: 'Install rustc' + $HOME/.cargo/bin/rustup component add rustfmt + displayName: 'Install rustc and components' - script: | $HOME/.cargo/bin/cargo check displayName: 'Check MeiliDB' - script: | $HOME/.cargo/bin/cargo test displayName: 'Test MeiliDB' + - script: | + $HOME/.cargo/bin/cargo fmt --all -- --check + displayName: 'Fmt MeiliDB' - job: build dependsOn: @@ -31,7 +35,8 @@ jobs: steps: - script: | curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly - displayName: 'Install rustc' + $HOME/.cargo/bin/rustup component add rustfmt + displayName: 'Install rustc and components' - script: | $HOME/.cargo/bin/cargo build --release displayName: 'Build MeiliDB' diff --git a/meilidb-core/examples/from_file.rs b/meilidb-core/examples/from_file.rs index 981f294ff..c04efb73c 100644 --- a/meilidb-core/examples/from_file.rs +++ b/meilidb-core/examples/from_file.rs @@ -4,15 +4,15 @@ use std::error::Error; use std::io::Write; use std::iter::FromIterator; use std::path::{Path, PathBuf}; -use std::time::{Instant, Duration}; +use std::time::{Duration, Instant}; use std::{fs, io, sync::mpsc}; -use rustyline::{Editor, Config}; -use serde::{Serialize, Deserialize}; +use rustyline::{Config, Editor}; +use serde::{Deserialize, Serialize}; use structopt::StructOpt; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; -use meilidb_core::{Highlight, Database, UpdateResult}; +use meilidb_core::{Database, Highlight, UpdateResult}; use meilidb_schema::SchemaAttr; const INDEX_NAME: &str = "default"; @@ -91,7 +91,7 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box index, - None => database.create_index(INDEX_NAME).unwrap() + None => database.create_index(INDEX_NAME).unwrap(), }; let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn)); @@ -108,14 +108,14 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box { if current_schema != schema { - return Err(meilidb_core::Error::SchemaDiffer.into()) + return Err(meilidb_core::Error::SchemaDiffer.into()); } writer.abort(); - }, + } None => { index.schema_update(&mut writer, schema)?; writer.commit().unwrap(); - }, + } } let mut rdr = csv::Reader::from_path(command.csv_data_path)?; @@ -131,7 +131,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box document, @@ -147,7 +149,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box Result<(), Box io::Result<()> { let mut highlighted = false; for range in ranges.windows(2) { - let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() }; + let [start, end] = match range { + [start, end] => [*start, *end], + _ => unreachable!(), + }; if highlighted { stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?; } @@ -221,12 +238,14 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec { let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); match byte_indexes.entry(byte_index) { - Entry::Vacant(entry) => { entry.insert(byte_length); }, + Entry::Vacant(entry) => { + entry.insert(byte_length); + } Entry::Occupied(mut entry) => { if *entry.get() < byte_length { entry.insert(byte_length); } - }, + } } } @@ -252,22 +271,23 @@ 
fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> { /// ``` fn crop_text( text: &str, - highlights: impl IntoIterator<Item=Highlight>, + highlights: impl IntoIterator<Item = Highlight>, context: usize, -) -> (String, Vec<Highlight>) -{ +) -> (String, Vec<Highlight>) { let mut highlights = highlights.into_iter().peekable(); - let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0); + let char_index = highlights + .peek() + .map(|m| m.char_index as usize) + .unwrap_or(0); let start = char_index.saturating_sub(context); let text = text.chars().skip(start).take(context * 2).collect(); let highlights = highlights - .take_while(|m| { - (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2) - }) - .map(|highlight| { - Highlight { char_index: highlight.char_index - start as u16, ..highlight } + .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)) + .map(|highlight| Highlight { + char_index: highlight.char_index - start as u16, + ..highlight }) .collect(); @@ -276,7 +296,9 @@ fn crop_text( fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> { let env = &database.env; - let index = database.open_index(INDEX_NAME).expect("Could not find index"); + let index = database + .open_index(INDEX_NAME) + .expect("Could not find index"); let reader = env.read_txn().unwrap(); let schema = index.main.schema(&reader)?; @@ -312,10 +334,15 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box< (true, filter) }; - let attr = schema.attribute(&filter).expect("Could not find filtered attribute"); + let attr = schema + .attribute(&filter) + .expect("Could not find filtered attribute"); builder.with_filter(move |document_id| { - let string: String = ref_index.document_attribute(ref_reader, document_id, attr).unwrap().unwrap(); + let string: String = ref_index + .document_attribute(ref_reader, document_id, attr) + .unwrap() + .unwrap(); (string == "true") == positive }); } @@ -326,8 +353,8 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box< let number_of_documents = documents.len(); for mut doc in documents { - - doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length)); + doc.highlights + .sort_unstable_by_key(|m| (m.char_index, m.char_length)); let start_retrieve = Instant::now(); let result = index.document::(&reader, Some(&fields), doc.id); @@ -340,15 +367,18 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box< print!("{}: ", name); let attr = schema.attribute(&name).unwrap(); - let highlights = doc.highlights.iter() - .filter(|m| SchemaAttr::new(m.attribute) == attr) - .cloned(); - let (text, highlights) = crop_text(&text, highlights, command.char_context); + let highlights = doc + .highlights + .iter() + .filter(|m| SchemaAttr::new(m.attribute) == attr) + .cloned(); + let (text, highlights) = + crop_text(&text, highlights, command.char_context); let areas = create_highlight_areas(&text, &highlights); display_highlights(&text, &areas)?; println!(); } - }, + } Ok(None) => eprintln!("missing document"), Err(e) => eprintln!("{}", e), } @@ -366,12 +396,19 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box< println!(); } - eprintln!("whole documents fields retrieve took {:.2?}", retrieve_duration); - eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed()); - }, + eprintln!( + "whole documents fields retrieve took {:.2?}", + retrieve_duration + ); + eprintln!( + "===== 
Found {} results in {:.2?} =====", + number_of_documents, + start_total.elapsed() + ); + } Err(err) => { println!("Error: {:?}", err); - break + break; } } } } diff --git a/meilidb-core/src/automaton/dfa.rs b/meilidb-core/src/automaton/dfa.rs index eb9ff0714..6258da424 100644 --- a/meilidb-core/src/automaton/dfa.rs +++ b/meilidb-core/src/automaton/dfa.rs @@ -1,8 +1,5 @@ +use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use once_cell::sync::OnceCell; -use levenshtein_automata::{ - LevenshteinAutomatonBuilder as LevBuilder, - DFA, -}; static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new(); static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new(); @@ -15,30 +12,30 @@ enum PrefixSetting { } fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA { - use PrefixSetting::{Prefix, NoPrefix}; + use PrefixSetting::{NoPrefix, Prefix}; match query.len() { - 0 ..= 4 => { + 0..=4 => { let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true)); match setting { - Prefix => builder.build_prefix_dfa(query), + Prefix => builder.build_prefix_dfa(query), NoPrefix => builder.build_dfa(query), } - }, - 5 ..= 8 => { + } + 5..=8 => { let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true)); match setting { - Prefix => builder.build_prefix_dfa(query), + Prefix => builder.build_prefix_dfa(query), NoPrefix => builder.build_dfa(query), } - }, + } _ => { let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true)); match setting { - Prefix => builder.build_prefix_dfa(query), + Prefix => builder.build_prefix_dfa(query), NoPrefix => builder.build_dfa(query), } - }, + } } } diff --git a/meilidb-core/src/automaton/mod.rs b/meilidb-core/src/automaton/mod.rs index 0f7e3137d..da398c669 100644 --- a/meilidb-core/src/automaton/mod.rs +++ b/meilidb-core/src/automaton/mod.rs @@ -6,14 +6,14 @@ use std::vec; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; -use meilidb_tokenizer::{split_query_string, is_cjk}; +use meilidb_tokenizer::{is_cjk, split_query_string}; -use crate::store; use crate::error::MResult; +use crate::store; use self::dfa::{build_dfa, build_prefix_dfa}; -use self::query_enhancer::QueryEnhancerBuilder; pub use self::query_enhancer::QueryEnhancer; +use self::query_enhancer::QueryEnhancerBuilder; const NGRAMS: usize = 3; @@ -27,14 +27,9 @@ impl AutomatonProducer { query: &str, main_store: store::Main, synonyms_store: store::Synonyms, - ) -> MResult<(AutomatonProducer, QueryEnhancer)> - { - let (automatons, query_enhancer) = generate_automatons( - reader, - query, - main_store, - synonyms_store, - )?; + ) -> MResult<(AutomatonProducer, QueryEnhancer)> { + let (automatons, query_enhancer) = + generate_automatons(reader, query, main_store, synonyms_store)?; Ok((AutomatonProducer { automatons }, query_enhancer)) } @@ -112,8 +107,7 @@ fn generate_automatons( query: &str, main_store: store::Main, synonym_store: store::Synonyms, -) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> -{ +) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let synonyms = match main_store.synonyms_fst(reader)?
{ @@ -130,7 +124,6 @@ fn generate_automatons( let mut original_automatons = Vec::new(); let mut original_words = query_words.iter().peekable(); while let Some(word) = original_words.next() { - let has_following_word = original_words.peek().is_some(); let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); @@ -148,29 +141,33 @@ fn generate_automatons( for n in 1..=NGRAMS { let mut ngrams = query_words.windows(n).enumerate().peekable(); while let Some((query_index, ngram_slice)) = ngrams.next() { - let query_range = query_index..query_index + n; let ngram_nb_words = ngram_slice.len(); let ngram = ngram_slice.join(" "); let has_following_word = ngrams.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + let not_prefix_dfa = + has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); // automaton of synonyms of the ngrams let normalized = normalize_str(&ngram); - let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }; + let lev = if not_prefix_dfa { + build_dfa(&normalized) + } else { + build_prefix_dfa(&normalized) + }; let mut stream = synonyms.search(&lev).into_stream(); while let Some(base) = stream.next() { - // only trigger alternatives when the last word has been typed // i.e. "new " do not but "new yo" triggers alternatives to "new york" let base = std::str::from_utf8(base).unwrap(); let base_nb_words = split_query_string(base).count(); - if ngram_nb_words != base_nb_words { continue } + if ngram_nb_words != base_nb_words { + continue; + } if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? { - let mut stream = synonyms.into_stream(); while let Some(synonyms) = stream.next() { let synonyms = std::str::from_utf8(synonyms).unwrap(); @@ -178,7 +175,11 @@ fn generate_automatons( let nb_synonym_words = synonyms_words.len(); let real_query_index = automaton_index; - enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + enhancer_builder.declare( + query_range.clone(), + real_query_index, + &synonyms_words, + ); for synonym in synonyms_words { let automaton = if nb_synonym_words == 1 { diff --git a/meilidb-core/src/automaton/query_enhancer.rs b/meilidb-core/src/automaton/query_enhancer.rs index 165c1b094..2194f3ff1 100644 --- a/meilidb-core/src/automaton/query_enhancer.rs +++ b/meilidb-core/src/automaton/query_enhancer.rs @@ -1,5 +1,5 @@ +use std::cmp::Ordering::{Equal, Greater, Less}; use std::ops::Range; -use std::cmp::Ordering::{Less, Greater, Equal}; /// Return `true` if the specified range can accept the given replacements words. 
/// Returns `false` if the replacements words are already present in the original query @@ -34,13 +34,14 @@ use std::cmp::Ordering::{Less, Greater, Equal}; // [new york city] // fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool -where S: AsRef<str>, - T: AsRef<str>, +where + S: AsRef<str>, + T: AsRef<str>, { if words.len() <= range.len() { // there is fewer or equal replacement words // than there is already in the replaced range - return false + return false; } // retrieve the part to rewrite but with the length @@ -49,7 +50,9 @@ where S: AsRef<str>, // check if the original query doesn't already contain // the replacement words - !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) + !original + .map(AsRef::as_ref) + .eq(words.iter().map(AsRef::as_ref)) } type Origin = usize; @@ -68,11 +71,20 @@ impl FakeIntervalTree { fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> { let element = self.intervals.binary_search_by(|(r, _)| { if point >= r.start { - if point < r.end { Equal } else { Less } - } else { Greater } + if point < r.end { + Equal + } else { + Less + } + } else { + Greater + } }); - let n = match element { Ok(n) => n, Err(n) => n }; + let n = match element { + Ok(n) => n, + Err(n) => n, + }; match self.intervals.get(n) { Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), @@ -90,10 +102,14 @@ pub struct QueryEnhancerBuilder<'a, S> { impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> { pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> { // we initialize origins query indices based on their positions - let origins: Vec<_> = (0..query.len() + 1).collect(); - let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); + let origins: Vec<_> = (0..=query.len()).collect(); + let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect(); - QueryEnhancerBuilder { query, origins, real_to_origin } + QueryEnhancerBuilder { + query, + origins, + real_to_origin, + } } /// Update the final real to origin query indices mapping. @@ -101,12 +117,12 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> { /// `range` is the original words range that this `replacement` words replace /// and `real` is the first real query index of these replacement words. 
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T]) - where T: AsRef<str>, + where + T: AsRef<str>, { // check if the range of original words // can be rewritten with the replacement words if rewrite_range_with(self.query, range.clone(), replacement) { - // this range can be replaced so we need to // modify the origins accordingly let offset = replacement.len() - range.len(); @@ -126,7 +142,8 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> { // we need to pad real query indices let real_range = real..real + replacement.len().max(range.len()); let real_length = replacement.len(); - self.real_to_origin.push((real_range, (range.start, real_length))); + self.real_to_origin + .push((real_range, (range.start, real_length))); } pub fn build(self) -> QueryEnhancer { @@ -148,10 +165,10 @@ impl QueryEnhancer { let real = real as usize; // query the fake interval tree with the real query index - let (range, (origin, real_length)) = - self.real_to_origin - .query(real) - .expect("real has never been declared"); + let (range, (origin, real_length)) = self + .real_to_origin + .query(real) + .expect("real has never been declared"); // if `real` is the end bound of the range if (range.start + real_length - 1) == real { @@ -160,7 +177,10 @@ impl QueryEnhancer { for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { let len = slice[1] - slice[0]; count = count.saturating_sub(len); - if count == 0 { new_origin = origin + i; break } + if count == 0 { + new_origin = origin + i; + break; + } } let n = real - range.start; @@ -168,15 +188,20 @@ impl QueryEnhancer { let end = self.origins[new_origin + 1]; let remaining = (end - start) - n; - Range { start: (start + n) as u32, end: (start + n + remaining) as u32 } - + Range { + start: (start + n) as u32, + end: (start + n + remaining) as u32, + } } else { // just return the origin along with // the real position of the word let n = real as usize - range.start; let origin = self.origins[origin]; - Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 } + Range { + start: (origin + n) as u32, + end: (origin + n + 1) as u32, + } } } } @@ -382,16 +407,16 @@ mod tests { let enhancer = builder.build(); - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - assert_eq!(enhancer.replacement(9), 0..2); // good + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + assert_eq!(enhancer.replacement(9), 0..2); // good assert_eq!(enhancer.replacement(10), 1..5); // NY assert_eq!(enhancer.replacement(11), 2..5); // metro } diff --git a/meilidb-core/src/criterion/document_id.rs b/meilidb-core/src/criterion/document_id.rs index 15549da24..e4a402d26 100644 --- a/meilidb-core/src/criterion/document_id.rs 
+++ b/meilidb-core/src/criterion/document_id.rs @@ -1,6 +1,6 @@ -use std::cmp::Ordering; use crate::criterion::Criterion; use crate::RawDocument; +use std::cmp::Ordering; #[derive(Debug, Clone, Copy)] pub struct DocumentId; diff --git a/meilidb-core/src/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs index 94c4d2746..3f4c4a974 100644 --- a/meilidb-core/src/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -1,8 +1,8 @@ use std::cmp::Ordering; +use meilidb_schema::SchemaAttr; use sdset::Set; use slice_group_by::GroupBy; -use meilidb_schema::SchemaAttr; use crate::criterion::Criterion; use crate::RawDocument; @@ -13,8 +13,7 @@ fn number_exact_matches( attribute: &[u16], is_exact: &[bool], fields_counts: &Set<(SchemaAttr, u64)>, -) -> usize -{ +) -> usize { let mut count = 0; let mut index = 0; @@ -22,12 +21,16 @@ fn number_exact_matches( let len = group.len(); let mut found_exact = false; - for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() { + for (pos, _) in is_exact[index..index + len] + .iter() + .filter(|x| **x) + .enumerate() + { found_exact = true; if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) { let (_, count) = fields_counts[pos]; if count == 1 { - return usize::max_value() + return usize::max_value(); } } } @@ -81,18 +84,18 @@ mod tests { #[test] fn easy_case() { let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[true]; let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[false]; + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[false]; let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) @@ -108,18 +111,18 @@ mod tests { #[test] fn basic() { let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[true]; let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[true]; let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) diff --git a/meilidb-core/src/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs index ad02d3023..e94b1b2c7 100644 --- a/meilidb-core/src/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -1,24 +1,20 @@ -mod sum_of_typos; +mod document_id; +mod exact; mod number_of_words; -mod words_proximity; +mod sort_by_attr; +mod sum_of_typos; mod sum_of_words_attribute; mod sum_of_words_position; -mod exact; -mod sort_by_attr; -mod document_id; +mod words_proximity; -use std::cmp::Ordering; use crate::RawDocument; +use std::cmp::Ordering; pub use self::{ - sum_of_typos::SumOfTypos, - number_of_words::NumberOfWords, + document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, + sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, + sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, words_proximity::WordsProximity, - sum_of_words_attribute::SumOfWordsAttribute, - 
sum_of_words_position::SumOfWordsPosition, - exact::Exact, - sort_by_attr::SortByAttr, - document_id::DocumentId, }; pub trait Criterion: Send + Sync { @@ -62,17 +58,18 @@ impl Criterion for Box<dyn Criterion> { #[derive(Default)] pub struct CriteriaBuilder<'a> { - inner: Vec<Box<dyn Criterion + 'a>> + inner: Vec<Box<dyn Criterion + 'a>>, } -impl<'a> CriteriaBuilder<'a> -{ +impl<'a> CriteriaBuilder<'a> { pub fn new() -> CriteriaBuilder<'a> { CriteriaBuilder { inner: Vec::new() } } pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> { - CriteriaBuilder { inner: Vec::with_capacity(capacity) } + CriteriaBuilder { + inner: Vec::with_capacity(capacity), + } } pub fn reserve(&mut self, additional: usize) { @@ -80,14 +77,16 @@ impl<'a> CriteriaBuilder<'a> } pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a> - where C: Criterion, + where + C: Criterion, { self.push(criterion); self } pub fn push<C: 'a>(&mut self, criterion: C) - where C: Criterion, + where + C: Criterion, { self.inner.push(Box::new(criterion)); } diff --git a/meilidb-core/src/criterion/number_of_words.rs b/meilidb-core/src/criterion/number_of_words.rs index 641385fb1..6c1218e2f 100644 --- a/meilidb-core/src/criterion/number_of_words.rs +++ b/meilidb-core/src/criterion/number_of_words.rs @@ -1,7 +1,7 @@ -use std::cmp::Ordering; -use slice_group_by::GroupBy; use crate::criterion::Criterion; use crate::RawDocument; +use slice_group_by::GroupBy; +use std::cmp::Ordering; #[inline] fn number_of_query_words(query_index: &[u32]) -> usize { diff --git a/meilidb-core/src/criterion/sort_by_attr.rs b/meilidb-core/src/criterion/sort_by_attr.rs index c19062dd6..632a4657c 100644 --- a/meilidb-core/src/criterion/sort_by_attr.rs +++ b/meilidb-core/src/criterion/sort_by_attr.rs @@ -2,9 +2,9 @@ use std::cmp::Ordering; use std::error::Error; use std::fmt; -use meilidb_schema::{Schema, SchemaAttr}; use crate::criterion::Criterion; -use crate::{RawDocument, RankedMap}; +use crate::{RankedMap, RawDocument}; +use meilidb_schema::{Schema, SchemaAttr}; /// An helper struct that permit to sort documents by /// some of their stored attributes. 
@@ -51,8 +51,7 @@ impl<'a> SortByAttr<'a> { ranked_map: &'a RankedMap, schema: &Schema, attr_name: &str, -) -> Result<SortByAttr<'a>, SortByAttrError> - { +) -> Result<SortByAttr<'a>, SortByAttrError> { SortByAttr::new(ranked_map, schema, attr_name, false) } @@ -60,8 +59,7 @@ impl<'a> SortByAttr<'a> { ranked_map: &'a RankedMap, schema: &Schema, attr_name: &str, -) -> Result<SortByAttr<'a>, SortByAttrError> - { +) -> Result<SortByAttr<'a>, SortByAttrError> { SortByAttr::new(ranked_map, schema, attr_name, true) } @@ -70,8 +68,7 @@ impl<'a> SortByAttr<'a> { schema: &Schema, attr_name: &str, reversed: bool, -) -> Result<SortByAttr<'a>, SortByAttrError> - { +) -> Result<SortByAttr<'a>, SortByAttrError> { let attr = match schema.attribute(attr_name) { Some(attr) => attr, None => return Err(SortByAttrError::AttributeNotFound), }; @@ -81,7 +78,11 @@ impl<'a> SortByAttr<'a> { return Err(SortByAttrError::AttributeNotRegisteredForRanking); } - Ok(SortByAttr { ranked_map, attr, reversed }) + Ok(SortByAttr { + ranked_map, + attr, + reversed, + }) } } @@ -93,11 +94,15 @@ impl<'a> Criterion for SortByAttr<'a> { match (lhs, rhs) { (Some(lhs), Some(rhs)) => { let order = lhs.cmp(&rhs); - if self.reversed { order.reverse() } else { order } - }, - (None, Some(_)) => Ordering::Greater, - (Some(_), None) => Ordering::Less, - (None, None) => Ordering::Equal, + if self.reversed { + order.reverse() + } else { + order + } + } + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + (None, None) => Ordering::Equal, } } @@ -122,4 +127,4 @@ impl fmt::Display for SortByAttrError { } } -impl Error for SortByAttrError { } +impl Error for SortByAttrError {} diff --git a/meilidb-core/src/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs index 9fbf0dab9..5cad73b42 100644 --- a/meilidb-core/src/criterion/sum_of_typos.rs +++ b/meilidb-core/src/criterion/sum_of_typos.rs @@ -11,10 +11,10 @@ use crate::RawDocument; #[inline] fn custom_log10(n: u8) -> f32 { match n { - 0 => 0.0, // log(1) - 1 => 0.30102, // log(2) - 2 => 0.47712, // log(3) - 3 => 0.60205, // log(4) + 0 => 0.0, // log(1) + 1 => 0.30102, // log(2) + 2 => 0.47712, // log(3) + 3 => 0.60205, // log(4) _ => panic!("invalid number"), } } diff --git a/meilidb-core/src/criterion/sum_of_words_attribute.rs b/meilidb-core/src/criterion/sum_of_words_attribute.rs index 2bf052159..472d771b7 100644 --- a/meilidb-core/src/criterion/sum_of_words_attribute.rs +++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs @@ -1,7 +1,7 @@ -use std::cmp::Ordering; -use slice_group_by::GroupBy; use crate::criterion::Criterion; use crate::RawDocument; +use slice_group_by::GroupBy; +use std::cmp::Ordering; #[inline] fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { diff --git a/meilidb-core/src/criterion/sum_of_words_position.rs b/meilidb-core/src/criterion/sum_of_words_position.rs index d5dd10ab7..70b8843dc 100644 --- a/meilidb-core/src/criterion/sum_of_words_position.rs +++ b/meilidb-core/src/criterion/sum_of_words_position.rs @@ -1,7 +1,7 @@ -use std::cmp::Ordering; -use slice_group_by::GroupBy; use crate::criterion::Criterion; use crate::RawDocument; +use slice_group_by::GroupBy; +use std::cmp::Ordering; #[inline] fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { diff --git a/meilidb-core/src/criterion/words_proximity.rs b/meilidb-core/src/criterion/words_proximity.rs index ed3775b50..579bc7b8c 100644 --- a/meilidb-core/src/criterion/words_proximity.rs +++ b/meilidb-core/src/criterion/words_proximity.rs @@ -1,7 +1,7 @@ -use std::cmp::{self, Ordering}; -use 
slice_group_by::GroupBy; use crate::criterion::Criterion; use crate::RawDocument; +use slice_group_by::GroupBy; +use std::cmp::{self, Ordering}; const MAX_DISTANCE: u16 = 8; @@ -19,7 +19,9 @@ fn index_proximity(lhs: u16, rhs: u16) -> u16 { } fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { - if lattr != rattr { return MAX_DISTANCE } + if lattr != rattr { + return MAX_DISTANCE; + } index_proximity(lwi, rwi) } @@ -42,15 +44,18 @@ fn matches_proximity( distance: &[u8], attribute: &[u16], word_index: &[u16], -) -> u16 -{ +) -> u16 { let mut query_index_groups = query_index.linear_group(); let mut proximity = 0; let mut index = 0; let get_attr_wi = |index: usize, group_len: usize| { // retrieve the first distance group (with the lowest values) - let len = distance[index..index + group_len].linear_group().next().unwrap().len(); + let len = distance[index..index + group_len] + .linear_group() + .next() + .unwrap() + .len(); let rattr = &attribute[index..index + len]; let rwi = &word_index[index..index + len]; @@ -110,7 +115,6 @@ mod tests { #[test] fn three_different_attributes() { - // "soup" "of the" "the day" // // { id: 0, attr: 0, attr_index: 0 } @@ -120,19 +124,21 @@ mod tests { // { id: 3, attr: 3, attr_index: 1 } let query_index = &[0, 1, 2, 2, 3]; - let distance = &[0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 2, 3]; - let word_index = &[0, 0, 1, 0, 1]; + let distance = &[0, 0, 0, 0, 0]; + let attribute = &[0, 1, 1, 2, 3]; + let word_index = &[0, 0, 1, 0, 1]; // soup -> of = 8 // + of -> the = 1 // + the -> day = 8 (not 1) - assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17); + assert_eq!( + matches_proximity(query_index, distance, attribute, word_index), + 17 + ); } #[test] fn two_different_attributes() { - // "soup day" "soup of the day" // // { id: 0, attr: 0, attr_index: 0 } @@ -143,13 +149,16 @@ mod tests { // { id: 3, attr: 1, attr_index: 3 } let query_index = &[0, 0, 1, 2, 3, 3]; - let distance = &[0, 0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 1, 0, 1]; - let word_index = &[0, 0, 1, 2, 1, 3]; + let distance = &[0, 0, 0, 0, 0, 0]; + let attribute = &[0, 1, 1, 1, 0, 1]; + let word_index = &[0, 0, 1, 2, 1, 3]; // soup -> of = 1 // + of -> the = 1 // + the -> day = 1 - assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3); + assert_eq!( + matches_proximity(query_index, distance, attribute, word_index), + 3 + ); } } diff --git a/meilidb-core/src/database.rs b/meilidb-core/src/database.rs index 035b7e7a4..64da2d5f9 100644 --- a/meilidb-core/src/database.rs +++ b/meilidb-core/src/database.rs @@ -1,13 +1,13 @@ -use std::collections::hash_map::{HashMap, Entry}; +use std::collections::hash_map::{Entry, HashMap}; use std::fs::File; use std::path::Path; use std::sync::{Arc, RwLock}; use std::{fs, thread}; -use zlmdb::{Result as ZResult, CompactionOption}; -use zlmdb::types::{Str, Unit}; use crossbeam_channel::Receiver; use log::{debug, error}; +use zlmdb::types::{Str, Unit}; +use zlmdb::{CompactionOption, Result as ZResult}; use crate::{store, update, Index, MResult}; @@ -32,20 +32,32 @@ fn update_awaiter( loop { let mut writer = match env.write_txn() { Ok(writer) => writer, - Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break } + Err(e) => { + error!("LMDB writer transaction begin failed: {}", e); + break; + } }; match update::update_task(&mut writer, index.clone()) { Ok(Some(status)) => { - if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) } + if let 
Err(e) = writer.commit() { + error!("update transaction failed: {}", e) + } if let Some(ref callback) = *update_fn.load() { (callback)(status); } - }, + } // no more updates to handle for now - Ok(None) => { debug!("no more updates"); writer.abort(); break }, - Err(e) => { error!("update task failed: {}", e); writer.abort() }, + Ok(None) => { + debug!("no more updates"); + writer.abort(); + break; + } + Err(e) => { + error!("update task failed: {}", e); + writer.abort() + } } } } @@ -76,14 +88,16 @@ impl Database { // open the previously aggregated indexes let mut indexes = HashMap::new(); for index_name in must_open { - let (sender, receiver) = crossbeam_channel::bounded(100); let index = match store::open(&env, &index_name, sender.clone())? { Some(index) => index, None => { - log::warn!("the index {} doesn't exist or has not all the databases", index_name); + log::warn!( + "the index {} doesn't exist or has not all the databases", + index_name + ); continue; - }, + } }; let update_fn = Arc::new(ArcSwapFn::empty()); @@ -100,10 +114,18 @@ impl Database { sender.send(()).unwrap(); let result = indexes.insert(index_name, (index, update_fn, handle)); - assert!(result.is_none(), "The index should not have been already open"); + assert!( + result.is_none(), + "The index should not have been already open" + ); } - Ok(Database { env, common_store, indexes_store, indexes: RwLock::new(indexes) }) + Ok(Database { + env, + common_store, + indexes_store, + indexes: RwLock::new(indexes), + }) } pub fn open_index(&self, name: impl AsRef) -> Option { @@ -152,7 +174,7 @@ impl Database { let update_fn = Some(Arc::new(update_fn)); current_update_fn.swap(update_fn); true - }, + } None => false, } } @@ -160,7 +182,10 @@ impl Database { pub fn unset_update_callback(&self, name: impl AsRef) -> bool { let indexes_lock = self.indexes.read().unwrap(); match indexes_lock.get(name.as_ref()) { - Some((_, current_update_fn, _)) => { current_update_fn.swap(None); true }, + Some((_, current_update_fn, _)) => { + current_update_fn.swap(None); + true + } None => false, } } diff --git a/meilidb-core/src/distinct_map.rs b/meilidb-core/src/distinct_map.rs index c53ad0ea4..e53592afe 100644 --- a/meilidb-core/src/distinct_map.rs +++ b/meilidb-core/src/distinct_map.rs @@ -1,5 +1,5 @@ -use std::hash::Hash; use hashbrown::HashMap; +use std::hash::Hash; pub struct DistinctMap { inner: HashMap, diff --git a/meilidb-core/src/error.rs b/meilidb-core/src/error.rs index 3523b0ad5..a39164341 100644 --- a/meilidb-core/src/error.rs +++ b/meilidb-core/src/error.rs @@ -1,6 +1,6 @@ -use std::{error, fmt, io}; +use crate::serde::{DeserializerError, SerializerError}; use serde_json::Error as SerdeJsonError; -use crate::serde::{SerializerError, DeserializerError}; +use std::{error, fmt, io}; pub type MResult = Result; @@ -90,7 +90,7 @@ impl fmt::Display for Error { } } -impl error::Error for Error { } +impl error::Error for Error {} #[derive(Debug)] pub enum UnsupportedOperation { diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 67ea20762..6beef461e 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -1,7 +1,9 @@ #[cfg(test)] -#[macro_use] extern crate assert_matches; +#[macro_use] +extern crate assert_matches; mod automaton; +pub mod criterion; mod database; mod distinct_map; mod error; @@ -9,31 +11,41 @@ mod number; mod query_builder; mod ranked_map; mod raw_document; -mod reordered_attrs; -mod update; -pub mod criterion; pub mod raw_indexer; +mod reordered_attrs; pub mod serde; pub mod store; +mod 
update; -pub use self::database::{Database, BoxUpdateFn}; +pub use self::database::{BoxUpdateFn, Database}; pub use self::error::{Error, MResult}; pub use self::number::{Number, ParseNumberError}; pub use self::ranked_map::RankedMap; pub use self::raw_document::RawDocument; pub use self::store::Index; -pub use self::update::{UpdateStatus, UpdateResult, UpdateType}; +pub use self::update::{UpdateResult, UpdateStatus, UpdateType}; +use ::serde::{Deserialize, Serialize}; use zerocopy::{AsBytes, FromBytes}; -use ::serde::{Serialize, Deserialize}; /// Represent an internally generated document unique identifier. /// /// It is used to inform the database the document you want to deserialize. /// Helpful for custom ranking. -#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] -#[derive(Serialize, Deserialize)] -#[derive(AsBytes, FromBytes)] +#[derive( + Debug, + Copy, + Clone, + Eq, + PartialEq, + PartialOrd, + Ord, + Hash, + Serialize, + Deserialize, + AsBytes, + FromBytes, +)] #[repr(C)] pub struct DocumentId(pub u64); @@ -42,8 +54,7 @@ pub struct DocumentId(pub u64); /// /// This is stored in the map, generated at index time, /// extracted and interpreted at search time. -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[derive(AsBytes, FromBytes)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)] #[repr(C)] pub struct DocIndex { /// The document identifier where the word was found. @@ -109,7 +120,10 @@ pub struct Document { impl Document { #[cfg(not(test))] fn from_raw(raw: RawDocument) -> Document { - Document { id: raw.id, highlights: raw.highlights } + Document { + id: raw.id, + highlights: raw.highlights, + } } #[cfg(test)] @@ -134,7 +148,11 @@ impl Document { matches.push(match_); } - Document { id: raw.id, matches, highlights: raw.highlights } + Document { + id: raw.id, + matches, + highlights: raw.highlights, + } } } diff --git a/meilidb-core/src/number.rs b/meilidb-core/src/number.rs index 5e64cc78f..ff3419008 100644 --- a/meilidb-core/src/number.rs +++ b/meilidb-core/src/number.rs @@ -1,12 +1,11 @@ -use std::num::{ParseIntError, ParseFloatError}; -use std::str::FromStr; use std::fmt; +use std::num::{ParseFloatError, ParseIntError}; +use std::str::FromStr; use ordered_float::OrderedFloat; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Number { Unsigned(u64), Signed(i64), @@ -32,7 +31,11 @@ impl FromStr for Number { Err(error) => error, }; - Err(ParseNumberError { uint_error, int_error, float_error }) + Err(ParseNumberError { + uint_error, + int_error, + float_error, + }) } } @@ -46,10 +49,17 @@ pub struct ParseNumberError { impl fmt::Display for ParseNumberError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if self.uint_error == self.int_error { - write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error) + write!( + f, + "can not parse number: {}, {}", + self.uint_error, self.float_error + ) } else { - write!(f, "can not parse number: {}, {}, {}", - self.uint_error, self.int_error, self.float_error) + write!( + f, + "can not parse number: {}, {}, {}", + self.uint_error, self.int_error, self.float_error + ) } } } diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 26a366c49..3d0769ec3 100644 --- 
a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -2,17 +2,17 @@ use hashbrown::HashMap; use std::mem; use std::ops::Range; use std::rc::Rc; -use std::time::{Instant, Duration}; +use std::time::{Duration, Instant}; use fst::{IntoStreamer, Streamer}; use sdset::SetBuf; use slice_group_by::{GroupBy, GroupByMut}; use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; -use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; -use crate::raw_document::{RawDocument, raw_documents_from}; -use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria}; -use crate::{store, MResult, reordered_attrs::ReorderedAttrs}; +use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; +use crate::raw_document::{raw_documents_from, RawDocument}; +use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; +use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; pub struct QueryBuilder<'c, 'f, 'd> { criteria: Criteria<'c>, @@ -29,8 +29,7 @@ pub struct QueryBuilder<'c, 'f, 'd> { fn multiword_rewrite_matches( mut matches: Vec<(DocumentId, TmpMatch)>, query_enhancer: &QueryEnhancer, -) -> SetBuf<(DocumentId, TmpMatch)> -{ +) -> SetBuf<(DocumentId, TmpMatch)> { let mut padded_matches = Vec::with_capacity(matches.len()); // we sort the matches by word index to make them rewritable @@ -38,7 +37,6 @@ fn multiword_rewrite_matches( // for each attribute of each document for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { - // padding will only be applied // to word indices in the same attribute let mut padding = 0; @@ -47,18 +45,20 @@ fn multiword_rewrite_matches( // for each match at the same position // in this document attribute while let Some(same_word_index) = iter.next() { - // find the biggest padding let mut biggest = 0; for (id, match_) in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index); let replacement_len = replacement.len(); let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); if let Some(query_index) = replacement.next() { let word_index = match_.word_index + padding as u16; - let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + let match_ = TmpMatch { + query_index, + word_index, + ..*match_ + }; padded_matches.push((*id, match_)); } @@ -67,22 +67,30 @@ fn multiword_rewrite_matches( // look ahead and if there already is a match // corresponding to this padding word, abort the padding 'padding: for (x, next_group) in nexts.enumerate() { - for (i, query_index) in replacement.clone().enumerate().skip(x) { let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let padmatch = TmpMatch { query_index, word_index, ..match_.clone() }; + let padmatch = TmpMatch { + query_index, + word_index, + ..*match_ + }; for (_, nmatch_) in next_group { let mut rep = query_enhancer.replacement(nmatch_.query_index); let query_index = rep.next().unwrap(); if query_index == padmatch.query_index { - if !found { // if we find a corresponding padding for the // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + for (i, query_index) in replacement.clone().enumerate().take(i) + { + let word_index = + match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { + query_index, + word_index, + ..*match_ + 
}; padded_matches.push((*id, match_)); biggest = biggest.max(i + 1); } @@ -97,7 +105,7 @@ fn multiword_rewrite_matches( // if we do not find a corresponding padding in the // next groups so stop here and pad what was found - break + break; } if !found { @@ -105,7 +113,11 @@ fn multiword_rewrite_matches( // we must insert the entire padding for (i, query_index) in replacement.enumerate() { let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + let match_ = TmpMatch { + query_index, + word_index, + ..*match_ + }; padded_matches.push((*id, match_)); } @@ -129,16 +141,20 @@ fn fetch_raw_documents( automatons: &[Automaton], query_enhancer: &QueryEnhancer, searchables: Option<&ReorderedAttrs>, - main_store: &store::Main, - postings_lists_store: &store::PostingsLists, - documents_fields_counts_store: &store::DocumentsFieldsCounts, -) -> MResult> -{ + main_store: store::Main, + postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, +) -> MResult> { let mut matches = Vec::new(); let mut highlights = Vec::new(); for automaton in automatons { - let Automaton { index, is_exact, query_len, .. } = automaton; + let Automaton { + index, + is_exact, + query_len, + .. + } = automaton; let dfa = automaton.dfa(); let words = match main_store.words_fst(reader)? { @@ -210,8 +226,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { postings_lists: store::PostingsLists, documents_fields_counts: store::DocumentsFieldsCounts, synonyms: store::Synonyms, - ) -> QueryBuilder<'c, 'f, 'd> - { + ) -> QueryBuilder<'c, 'f, 'd> { QueryBuilder::with_criteria( main, postings_lists, @@ -227,8 +242,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { documents_fields_counts: store::DocumentsFieldsCounts, synonyms: store::Synonyms, criteria: Criteria<'c>, - ) -> QueryBuilder<'c, 'f, 'd> - { + ) -> QueryBuilder<'c, 'f, 'd> { QueryBuilder { criteria, searchable_attrs: None, @@ -245,7 +259,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { pub fn with_filter(&mut self, function: F) - where F: Fn(DocumentId) -> bool + 'f, + where + F: Fn(DocumentId) -> bool + 'f, { self.filter = Some(Box::new(function)) } @@ -255,13 +270,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { } pub fn with_distinct(&mut self, function: F, size: usize) - where F: Fn(DocumentId) -> Option + 'd, + where + F: Fn(DocumentId) -> Option + 'd, { self.distinct = Some((Box::new(function), size)) } pub fn add_searchable_attribute(&mut self, attribute: u16) { - let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new); + let reorders = self + .searchable_attrs + .get_or_insert_with(ReorderedAttrs::new); reorders.insert_attribute(attribute); } @@ -270,41 +288,36 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { reader: &zlmdb::RoTxn, query: &str, range: Range, - ) -> MResult> - { + ) -> MResult> { match self.distinct { - Some((distinct, distinct_size)) => { - raw_query_with_distinct( - reader, - query, - range, - self.filter, - distinct, - distinct_size, - self.timeout, - self.criteria, - self.searchable_attrs, - self.main_store, - self.postings_lists_store, - self.documents_fields_counts_store, - self.synonyms_store, - ) - }, - None => { - raw_query( - reader, - query, - range, - self.filter, - self.timeout, - self.criteria, - self.searchable_attrs, - self.main_store, - self.postings_lists_store, - self.documents_fields_counts_store, - self.synonyms_store, - ) - } + 
Some((distinct, distinct_size)) => raw_query_with_distinct( + reader, + query, + range, + self.filter, + distinct, + distinct_size, + self.timeout, + self.criteria, + self.searchable_attrs, + self.main_store, + self.postings_lists_store, + self.documents_fields_counts_store, + self.synonyms_store, + ), + None => raw_query( + reader, + query, + range, + self.filter, + self.timeout, + self.criteria, + self.searchable_attrs, + self.main_store, + self.postings_lists_store, + self.documents_fields_counts_store, + self.synonyms_store, + ), } } } @@ -326,7 +339,8 @@ fn raw_query<'c, FI>( documents_fields_counts_store: store::DocumentsFieldsCounts, synonyms_store: store::Synonyms, ) -> MResult> -where FI: Fn(DocumentId) -> bool, +where + FI: Fn(DocumentId) -> bool, { // We delegate the filter work to the distinct query builder, // specifying a distinct rule that has no effect. @@ -347,24 +361,20 @@ where FI: Fn(DocumentId) -> bool, postings_lists_store, documents_fields_counts_store, synonyms_store, - ) + ); } let start_processing = Instant::now(); let mut raw_documents_processed = Vec::with_capacity(range.len()); - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - synonyms_store, - )?; + let (automaton_producer, query_enhancer) = + AutomatonProducer::new(reader, query, main_store, synonyms_store)?; - let mut automaton_producer = automaton_producer.into_iter(); + let automaton_producer = automaton_producer.into_iter(); let mut automatons = Vec::new(); // aggregate automatons groups by groups after time - while let Some(auts) = automaton_producer.next() { + for auts in automaton_producer { automatons.extend(auts); // we must retrieve the documents associated @@ -374,15 +384,15 @@ where FI: Fn(DocumentId) -> bool, &automatons, &query_enhancer, searchable_attrs.as_ref(), - &main_store, - &postings_lists_store, - &documents_fields_counts_store, + main_store, + postings_lists_store, + documents_fields_counts_store, )?; // stop processing when time is running out if let Some(timeout) = timeout { if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break + break; } } @@ -409,20 +419,27 @@ where FI: Fn(DocumentId) -> bool, // we have sort enough documents if the last document sorted is after // the end of the requested range, we can continue to the next criterion - if documents_seen >= range.end { continue 'criteria } + if documents_seen >= range.end { + continue 'criteria; + } } } } // once we classified the documents related to the current // automatons we save that as the next valid result - let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + let iter = raw_documents + .into_iter() + .skip(range.start) + .take(range.len()); raw_documents_processed.clear(); raw_documents_processed.extend(iter); // stop processing when time is running out if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { break } + if start_processing.elapsed() > timeout { + break; + } } } @@ -430,7 +447,7 @@ where FI: Fn(DocumentId) -> bool, // those must be returned let documents = raw_documents_processed .into_iter() - .map(|d| Document::from_raw(d)) + .map(Document::from_raw) .collect(); Ok(documents) @@ -456,24 +473,21 @@ fn raw_query_with_distinct<'c, FI, FD>( documents_fields_counts_store: store::DocumentsFieldsCounts, synonyms_store: store::Synonyms, ) -> MResult> -where FI: Fn(DocumentId) -> bool, - FD: Fn(DocumentId) -> Option, +where + FI: Fn(DocumentId) -> bool, + FD: Fn(DocumentId) -> Option, { 
let start_processing = Instant::now(); let mut raw_documents_processed = Vec::new(); - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - synonyms_store, - )?; + let (automaton_producer, query_enhancer) = + AutomatonProducer::new(reader, query, main_store, synonyms_store)?; - let mut automaton_producer = automaton_producer.into_iter(); + let automaton_producer = automaton_producer.into_iter(); let mut automatons = Vec::new(); // aggregate automatons groups by groups after time - while let Some(auts) = automaton_producer.next() { + for auts in automaton_producer { automatons.extend(auts); // we must retrieve the documents associated @@ -483,15 +497,15 @@ where FI: Fn(DocumentId) -> bool, &automatons, &query_enhancer, searchable_attrs.as_ref(), - &main_store, - &postings_lists_store, - &documents_fields_counts_store, + main_store, + postings_lists_store, + documents_fields_counts_store, )?; // stop processing when time is running out if let Some(timeout) = timeout { if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break + break; } } @@ -528,7 +542,7 @@ where FI: Fn(DocumentId) -> bool, Some(filter) => { let entry = filter_map.entry(document.id); *entry.or_insert_with(|| (filter)(document.id)) - }, + } None => true, }; @@ -543,7 +557,9 @@ where FI: Fn(DocumentId) -> bool, } // the requested range end is reached: stop computing distinct - if buf_distinct.len() >= range.end { break } + if buf_distinct.len() >= range.end { + break; + } } documents_seen += group.len(); @@ -558,7 +574,9 @@ where FI: Fn(DocumentId) -> bool, // we have sort enough documents if the last document sorted is after // the end of the requested range, we can continue to the next criterion - if buf_distinct.len() >= range.end { continue 'criteria } + if buf_distinct.len() >= range.end { + continue 'criteria; + } } } } @@ -583,14 +601,18 @@ where FI: Fn(DocumentId) -> bool, if distinct_accepted && seen.len() > range.start { raw_documents_processed.push(document); - if raw_documents_processed.len() == range.len() { break } + if raw_documents_processed.len() == range.len() { + break; + } } } } // stop processing when time is running out if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { break } + if start_processing.elapsed() > timeout { + break; + } } } @@ -598,7 +620,7 @@ where FI: Fn(DocumentId) -> bool, // those must be returned let documents = raw_documents_processed .into_iter() - .map(|d| Document::from_raw(d)) + .map(Document::from_raw) .collect(); Ok(documents) @@ -611,20 +633,20 @@ mod tests { use std::collections::{BTreeSet, HashMap}; use std::iter::FromIterator; - use fst::{Set, IntoStreamer}; + use fst::{IntoStreamer, Set}; + use meilidb_schema::SchemaAttr; use sdset::SetBuf; use tempfile::TempDir; - use meilidb_schema::SchemaAttr; use crate::automaton::normalize_str; use crate::database::Database; - use crate::DocIndex; use crate::store::Index; + use crate::DocIndex; fn set_from_stream<'f, I, S>(stream: I) -> Set where - I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>, + I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>, { let mut builder = fst::SetBuilder::memory(); builder.extend_stream(stream).unwrap(); @@ -687,14 +709,23 @@ mod tests { let word = word.to_lowercase(); - let alternatives = match self.index.synonyms.synonyms(&writer, word.as_bytes()).unwrap() { + let alternatives = 
match self + .index + .synonyms + .synonyms(&writer, word.as_bytes()) + .unwrap() + { Some(alternatives) => alternatives, None => fst::Set::default(), }; let new = sdset_into_fstset(&new); - let new_alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union()); - self.index.synonyms.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives).unwrap(); + let new_alternatives = + set_from_stream(alternatives.op().add(new.into_stream()).r#union()); + self.index + .synonyms + .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives) + .unwrap(); let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() { Some(synonyms) => synonyms, @@ -702,14 +733,17 @@ mod tests { }; let synonyms_fst = insert_key(&synonyms, word.as_bytes()); - self.index.main.put_synonyms_fst(&mut writer, &synonyms_fst).unwrap(); + self.index + .main + .put_synonyms_fst(&mut writer, &synonyms_fst) + .unwrap(); writer.commit().unwrap(); } } impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase { - fn from_iter>(iter: I) -> Self { + fn from_iter>(iter: I) -> Self { let tempdir = TempDir::new().unwrap(); let database = Database::open_or_create(&tempdir).unwrap(); let index = database.create_index("default").unwrap(); @@ -724,7 +758,10 @@ mod tests { for (word, indexes) in iter { let word = word.to_lowercase().into_bytes(); words_fst.insert(word.clone()); - postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes); + postings_lists + .entry(word) + .or_insert_with(Vec::new) + .extend_from_slice(indexes); for idx in indexes { fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); } @@ -736,31 +773,33 @@ mod tests { for (word, postings_list) in postings_lists { let postings_list = SetBuf::from_dirty(postings_list); - index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap(); + index + .postings_lists + .put_postings_list(&mut writer, &word, &postings_list) + .unwrap(); } for ((docid, attr, _), count) in fields_counts { - let prev = index.documents_fields_counts - .document_field_count( - &mut writer, - docid, - SchemaAttr(attr), - ).unwrap(); + let prev = index + .documents_fields_counts + .document_field_count(&mut writer, docid, SchemaAttr(attr)) + .unwrap(); let prev = prev.unwrap_or(0); - index.documents_fields_counts - .put_document_field_count( - &mut writer, - docid, - SchemaAttr(attr), - prev + count, - ).unwrap(); + index + .documents_fields_counts + .put_document_field_count(&mut writer, docid, SchemaAttr(attr), prev + count) + .unwrap(); } writer.commit().unwrap(); - TempDatabase { database, index, _tempdir: tempdir } + TempDatabase { + database, + index, + _tempdir: tempdir, + } } } @@ -768,8 +807,8 @@ mod tests { fn simple() { let store = TempDatabase::from_iter(vec![ ("iphone", &[doc_char_index(0, 0, 0)][..]), - ("from", &[doc_char_index(0, 1, 1)][..]), - ("apple", &[doc_char_index(0, 2, 2)][..]), + ("from", &[doc_char_index(0, 1, 1)][..]), + ("apple", &[doc_char_index(0, 2, 2)][..]), ]); let env = &store.database.env; @@ -791,9 +830,7 @@ mod tests { #[test] fn simple_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("hello", &[doc_index(0, 0)][..]), - ]); + let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); @@ -825,9 +862,7 @@ mod tests { #[test] fn prefix_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("hello", &[doc_index(0, 0)][..]), - ]); + let mut store = 
TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); @@ -872,9 +907,7 @@ mod tests { #[test] fn levenshtein_synonyms() { - let mut store = TempDatabase::from_iter(vec![ - ("hello", &[doc_index(0, 0)][..]), - ]); + let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]); store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); @@ -907,9 +940,9 @@ mod tests { #[test] fn harder_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("hello", &[doc_index(0, 0)][..]), + ("hello", &[doc_index(0, 0)][..]), ("bonjour", &[doc_index(1, 3)]), - ("salut", &[doc_index(2, 5)]), + ("salut", &[doc_index(2, 5)]), ]); store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); @@ -987,17 +1020,22 @@ mod tests { /// Unique word has multi-word synonyms fn unique_to_multiword_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), ("subway", &[doc_char_index(0, 3, 3)][..]), - - ("NY", &[doc_char_index(1, 0, 0)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), ("subway", &[doc_char_index(1, 1, 1)][..]), ]); - store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); - store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym( + "NY", + SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + ); + store.add_synonym( + "NYC", + SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + ); let env = &store.database.env; let reader = env.read_txn().unwrap(); @@ -1056,20 +1094,18 @@ mod tests { #[test] fn unique_to_multiword_synonyms_words_proximity() { let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), ("subway", &[doc_char_index(0, 3, 3)][..]), - - ("york", &[doc_char_index(1, 0, 0)][..]), - ("new", &[doc_char_index(1, 1, 1)][..]), + ("york", &[doc_char_index(1, 0, 0)][..]), + ("new", &[doc_char_index(1, 1, 1)][..]), ("subway", &[doc_char_index(1, 2, 2)][..]), - - ("NY", &[doc_char_index(2, 0, 0)][..]), + ("NY", &[doc_char_index(2, 0, 0)][..]), ("subway", &[doc_char_index(2, 1, 1)][..]), ]); - store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); + store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"])); let env = &store.database.env; let reader = env.read_txn().unwrap(); @@ -1120,11 +1156,10 @@ mod tests { #[test] fn unique_to_multiword_synonyms_cumulative_word_index() { let mut store = TempDatabase::from_iter(vec![ - ("NY", &[doc_char_index(0, 0, 0)][..]), + ("NY", &[doc_char_index(0, 0, 0)][..]), ("subway", &[doc_char_index(0, 1, 1)][..]), - - ("new", &[doc_char_index(1, 0, 0)][..]), - ("york", &[doc_char_index(1, 1, 1)][..]), + ("new", &[doc_char_index(1, 0, 0)][..]), + ("york", &[doc_char_index(1, 1, 1)][..]), ("subway", &[doc_char_index(1, 2, 2)][..]), ]); @@ -1175,20 +1210,25 @@ mod tests { /// Unique word has multi-word synonyms fn harder_unique_to_multiword_synonyms_one() { let mut store = 
TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), - ("yellow", &[doc_char_index(0, 3, 3)][..]), - ("subway", &[doc_char_index(0, 4, 4)][..]), - ("broken", &[doc_char_index(0, 5, 5)][..]), - - ("NY", &[doc_char_index(1, 0, 0)][..]), - ("blue", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("subway", &[doc_char_index(0, 4, 4)][..]), + ("broken", &[doc_char_index(0, 5, 5)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), ]); - store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); - store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym( + "NY", + SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + ); + store.add_synonym( + "NYC", + SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + ); let env = &store.database.env; let reader = env.read_txn().unwrap(); @@ -1249,21 +1289,26 @@ mod tests { /// Unique word has multi-word synonyms fn even_harder_unique_to_multiword_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), - ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), ("underground", &[doc_char_index(0, 4, 4)][..]), - ("train", &[doc_char_index(0, 5, 5)][..]), - ("broken", &[doc_char_index(0, 6, 6)][..]), - - ("NY", &[doc_char_index(1, 0, 0)][..]), - ("blue", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), + ("train", &[doc_char_index(0, 5, 5)][..]), + ("broken", &[doc_char_index(0, 6, 6)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), ]); - store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); - store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym( + "NY", + SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]), + ); + store.add_synonym( + "NYC", + SetBuf::from_dirty(vec!["NY", "new york", "new york city"]), + ); store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); let env = &store.database.env; @@ -1330,30 +1375,36 @@ mod tests { /// Multi-word has multi-word synonyms fn multiword_to_multiword_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("NY", &[doc_char_index(0, 0, 0)][..]), - ("subway", &[doc_char_index(0, 1, 1)][..]), - - ("NYC", &[doc_char_index(1, 0, 0)][..]), - ("blue", &[doc_char_index(1, 1, 1)][..]), - ("subway", &[doc_char_index(1, 2, 2)][..]), - ("broken", &[doc_char_index(1, 3, 3)][..]), - - ("new", &[doc_char_index(2, 0, 0)][..]), - ("york", &[doc_char_index(2, 1, 1)][..]), + ("NY", &[doc_char_index(0, 0, 0)][..]), + ("subway", &[doc_char_index(0, 1, 1)][..]), + ("NYC", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ("broken", 
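
// [illustrative sketch — not part of the original patch]
// add_synonym above merges the existing alternatives with the new ones by
// streaming an fst set union into a fresh builder. A standalone sketch of
// that merge, assuming the fst 0.3-era API this patch uses
// (Set::from_iter, SetBuilder::memory, Set::from_bytes):
use fst::{Set, SetBuilder, Streamer};

fn union_sets(a: &Set, b: &Set) -> fst::Result<Set> {
    let mut stream = a.op().add(b).union();
    let mut builder = SetBuilder::memory();
    while let Some(key) = stream.next() {
        builder.insert(key)?; // union yields keys in sorted order
    }
    Set::from_bytes(builder.into_inner()?)
}

fn main() -> fst::Result<()> {
    let old = Set::from_iter(vec!["hello"])?;
    let new = Set::from_iter(vec!["bonjour", "salut"])?;
    let merged = union_sets(&old, &new)?;
    assert_eq!(merged.len(), 3);
    Ok(())
}
// [end sketch]
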
&[doc_char_index(1, 3, 3)][..]), + ("new", &[doc_char_index(2, 0, 0)][..]), + ("york", &[doc_char_index(2, 1, 1)][..]), ("underground", &[doc_char_index(2, 2, 2)][..]), - ("train", &[doc_char_index(2, 3, 3)][..]), - ("broken", &[doc_char_index(2, 4, 4)][..]), + ("train", &[doc_char_index(2, 3, 3)][..]), + ("broken", &[doc_char_index(2, 4, 4)][..]), ]); - store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ])); - store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ])); - store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ])); + store.add_synonym( + "new york", + SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]), + ); + store.add_synonym( + "new york city", + SetBuf::from_dirty(vec!["NYC", "NY", "new york"]), + ); + store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); let env = &store.database.env; let reader = env.read_txn().unwrap(); let builder = store.query_builder(); - let results = builder.query(&reader, "new york underground train broken", 0..20).unwrap(); + let results = builder + .query(&reader, "new york underground train broken", 0..20) + .unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { @@ -1390,7 +1441,9 @@ mod tests { assert_matches!(iter.next(), None); let builder = store.query_builder(); - let results = builder.query(&reader, "new york city underground train broken", 0..20).unwrap(); + let results = builder + .query(&reader, "new york city underground train broken", 0..20) + .unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { @@ -1436,14 +1489,14 @@ mod tests { #[test] fn intercrossed_multiword_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("new", &[doc_index(0, 0)][..]), - ("york", &[doc_index(0, 1)][..]), - ("big", &[doc_index(0, 2)][..]), - ("city", &[doc_index(0, 3)][..]), + ("new", &[doc_index(0, 0)][..]), + ("york", &[doc_index(0, 1)][..]), + ("big", &[doc_index(0, 2)][..]), + ("city", &[doc_index(0, 3)][..]), ]); - store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ])); - store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ])); + store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"])); + store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"])); let env = &store.database.env; let reader = env.read_txn().unwrap(); @@ -1469,16 +1522,14 @@ mod tests { assert_matches!(iter.next(), None); let mut store = TempDatabase::from_iter(vec![ - ("NY", &[doc_index(0, 0)][..]), - ("city", &[doc_index(0, 1)][..]), + ("NY", &[doc_index(0, 0)][..]), + ("city", &[doc_index(0, 1)][..]), ("subway", &[doc_index(0, 2)][..]), - - ("NY", &[doc_index(1, 0)][..]), + ("NY", &[doc_index(1, 0)][..]), ("subway", &[doc_index(1, 1)][..]), - - ("NY", &[doc_index(2, 0)][..]), - ("york", &[doc_index(2, 1)][..]), - ("city", &[doc_index(2, 2)][..]), + ("NY", &[doc_index(2, 0)][..]), + ("york", &[doc_index(2, 1)][..]), + ("city", &[doc_index(2, 2)][..]), ("subway", &[doc_index(2, 3)][..]), ]); @@ -1525,20 +1576,22 @@ mod tests { #[test] fn cumulative_word_indices() { let mut store = TempDatabase::from_iter(vec![ - ("NYC", &[doc_index(0, 0)][..]), - ("long", &[doc_index(0, 1)][..]), + ("NYC", &[doc_index(0, 0)][..]), + ("long", &[doc_index(0, 1)][..]), ("subway", &[doc_index(0, 2)][..]), - ("cool", &[doc_index(0, 3)][..]), + ("cool", &[doc_index(0, 
3)][..]), ]); store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"])); - store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); let env = &store.database.env; let reader = env.read_txn().unwrap(); let builder = store.query_builder(); - let results = builder.query(&reader, "new york city long subway cool ", 0..20).unwrap(); + let results = builder + .query(&reader, "new york city long subway cool ", 0..20) + .unwrap(); let mut iter = results.into_iter(); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { @@ -1560,8 +1613,7 @@ mod tests { let mut store = TempDatabase::from_iter(vec![ ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex - - ("iphone", &[doc_index(1, 0)][..]), + ("iphone", &[doc_index(1, 0)][..]), ]); store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"])); @@ -1624,8 +1676,8 @@ mod tests { #[test] fn simple_concatenation() { let store = TempDatabase::from_iter(vec![ - ("iphone", &[doc_index(0, 0)][..]), - ("case", &[doc_index(0, 1)][..]), + ("iphone", &[doc_index(0, 0)][..]), + ("case", &[doc_index(0, 1)][..]), ]); let env = &store.database.env; diff --git a/meilidb-core/src/ranked_map.rs b/meilidb-core/src/ranked_map.rs index d5bd15873..2675339eb 100644 --- a/meilidb-core/src/ranked_map.rs +++ b/meilidb-core/src/ranked_map.rs @@ -2,12 +2,11 @@ use std::io::{Read, Write}; use hashbrown::HashMap; use meilidb_schema::SchemaAttr; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; use crate::{DocumentId, Number}; -#[derive(Debug, Default, Clone, PartialEq, Eq)] -#[derive(Serialize, Deserialize)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(transparent)] pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>); @@ -16,6 +15,10 @@ impl RankedMap { self.0.len() } + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) { self.0.insert((document, attribute), number); } diff --git a/meilidb-core/src/raw_document.rs b/meilidb-core/src/raw_document.rs index 16cc9edda..ccdaa0b2f 100644 --- a/meilidb-core/src/raw_document.rs +++ b/meilidb-core/src/raw_document.rs @@ -1,11 +1,11 @@ -use std::sync::Arc; use std::fmt; +use std::sync::Arc; use meilidb_schema::SchemaAttr; use sdset::SetBuf; use slice_group_by::GroupBy; -use crate::{TmpMatch, DocumentId, Highlight}; +use crate::{DocumentId, Highlight, TmpMatch}; #[derive(Clone)] pub struct RawDocument { @@ -20,7 +20,13 @@ impl RawDocument { let r = self.matches.range; // it is safe because construction/modifications // can only be done in this module - unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + unsafe { + &self + .matches + .matches + .query_index + .get_unchecked(r.start..r.end) + } } pub fn distance(&self) -> &[u8] { @@ -41,7 +47,13 @@ impl RawDocument { let r = self.matches.range; // it is safe because construction/modifications // can only be done in this module - unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + unsafe { + &self + .matches + .matches + .word_index + .get_unchecked(r.start..r.end) + } } pub fn is_exact(&self) -> &[bool] { @@ -55,12 +67,32 @@ impl RawDocument { impl fmt::Debug for RawDocument { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 
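
// [illustrative sketch — not part of the original patch]
// The RankedMap hunk above pairs len() with a new is_empty(), the shape
// clippy::len_without_is_empty expects on collection newtypes. Minimal
// sketch with a std HashMap standing in for hashbrown's:
use std::collections::HashMap;

#[derive(Debug, Default)]
pub struct CountMap(HashMap<(u64, u16), u64>);

impl CountMap {
    pub fn len(&self) -> usize {
        self.0.len()
    }

    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}

fn main() {
    let map = CountMap::default();
    assert!(map.is_empty());
    assert_eq!(map.len(), 0);
}
// [end sketch]
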
f.write_str("RawDocument {\r\n")?; - f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; + f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; + f.write_fmt(format_args!( + "{:>15}: {:^5?},\r\n", + "query_index", + self.query_index() + ))?; + f.write_fmt(format_args!( + "{:>15}: {:^5?},\r\n", + "distance", + self.distance() + ))?; + f.write_fmt(format_args!( + "{:>15}: {:^5?},\r\n", + "attribute", + self.attribute() + ))?; + f.write_fmt(format_args!( + "{:>15}: {:^5?},\r\n", + "word_index", + self.word_index() + ))?; + f.write_fmt(format_args!( + "{:>15}: {:^5?},\r\n", + "is_exact", + self.is_exact() + ))?; f.write_str("}")?; Ok(()) } @@ -70,8 +102,7 @@ pub fn raw_documents_from( matches: SetBuf<(DocumentId, TmpMatch)>, highlights: SetBuf<(DocumentId, Highlight)>, fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>, -) -> Vec -{ +) -> Vec { let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); @@ -94,10 +125,21 @@ pub fn raw_documents_from( } let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| { - let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument { id, matches, highlights, fields_counts } - }).collect() + docs_ranges + .into_iter() + .map(|(id, range, highlights, fields_counts)| { + let matches = SharedMatches { + range, + matches: matches.clone(), + }; + RawDocument { + id, + matches, + highlights, + fields_counts, + } + }) + .collect() } #[derive(Debug, Copy, Clone)] diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 980b622f7..396134436 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -1,10 +1,10 @@ use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; +use crate::{DocIndex, DocumentId}; use deunicode::deunicode_with_tofu; -use crate::{DocumentId, DocIndex}; use meilidb_schema::SchemaAttr; -use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; +use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer}; use sdset::SetBuf; type Word = Vec; // TODO make it be a SmallVec @@ -60,7 +60,9 @@ impl RawIndexer { &mut self.docs_words, ); - if !must_continue { break } + if !must_continue { + break; + } number_of_words += 1; } @@ -70,8 +72,9 @@ impl RawIndexer { } pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) - where I: IntoIterator, - IT: Iterator + Clone, + where + I: IntoIterator, + IT: Iterator + Clone, { // TODO serialize this to one call to the SeqTokenizer loop @@ -88,14 +91,25 @@ impl RawIndexer { &mut self.docs_words, ); - if !must_continue { break } + if !must_continue { + break; + } } - let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| { - if lowercase_text.contains(is_cjk) { return lowercase_text } - let deunicoded = deunicode_with_tofu(&lowercase_text, ""); - if lowercase_text != deunicoded { deunicoded } else { lowercase_text } - }).collect(); + let deunicoded: Vec<_> = lowercased + .into_iter() + .map(|lowercase_text| { + if 
lowercase_text.contains(is_cjk) { + return lowercase_text; + } + let deunicoded = deunicode_with_tofu(&lowercase_text, ""); + if lowercase_text != deunicoded { + deunicoded + } else { + lowercase_text + } + }) + .collect(); let iter = deunicoded.iter().map(|t| t.as_str()); for token in SeqTokenizer::new(iter) { @@ -108,17 +122,21 @@ impl RawIndexer { &mut self.docs_words, ); - if !must_continue { break } + if !must_continue { + break; + } } } pub fn build(self) -> Indexed { - let words_doc_indexes = self.words_doc_indexes + let words_doc_indexes = self + .words_doc_indexes .into_iter() .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes))) .collect(); - let docs_words = self.docs_words + let docs_words = self + .docs_words .into_iter() .map(|(id, mut words)| { words.sort_unstable(); @@ -127,7 +145,16 @@ impl RawIndexer { }) .collect(); - Indexed { words_doc_indexes, docs_words } + Indexed { + words_doc_indexes, + docs_words, + } + } +} + +impl Default for RawIndexer { + fn default() -> Self { + Self::new() } } @@ -138,16 +165,20 @@ fn index_token( word_limit: usize, words_doc_indexes: &mut BTreeMap>, docs_words: &mut HashMap>, -) -> bool -{ - if token.word_index >= word_limit { return false } +) -> bool { + if token.word_index >= word_limit { + return false; + } match token_to_docindex(id, attr, token) { Some(docindex) => { let word = Vec::from(token.word); - words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); + words_doc_indexes + .entry(word.clone()) + .or_insert_with(Vec::new) + .push(docindex); docs_words.entry(id).or_insert_with(Vec::new).push(word); - }, + } None => return false, } @@ -183,7 +214,9 @@ mod tests { let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; indexer.index_text(docid, attr, text); - let Indexed { words_doc_indexes, .. } = indexer.build(); + let Indexed { + words_doc_indexes, .. + } = indexer.build(); assert!(words_doc_indexes.get(&b"l"[..]).is_some()); assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); @@ -191,7 +224,9 @@ mod tests { assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); // with the ugly apostrophe... - assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some()); + assert!(words_doc_indexes + .get(&"l’éteindre".to_owned().into_bytes()) + .is_some()); } #[test] @@ -203,7 +238,9 @@ mod tests { let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"]; indexer.index_text_seq(docid, attr, text); - let Indexed { words_doc_indexes, .. } = indexer.build(); + let Indexed { + words_doc_indexes, .. + } = indexer.build(); assert!(words_doc_indexes.get(&b"l"[..]).is_some()); assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); @@ -211,6 +248,8 @@ mod tests { assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); // with the ugly apostrophe... 
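
// [illustrative sketch — not part of the original patch]
// index_text_seq above lowercases, then transliterates with deunicode and
// keeps the original string when nothing changed, so "téléphone" is indexed
// as "telephone" while plain ASCII is left untouched. Standalone sketch of
// that step (the is_cjk guard from meilidb_tokenizer is omitted here):
use deunicode::deunicode_with_tofu;

fn normalize(texts: Vec<String>) -> Vec<String> {
    texts
        .into_iter()
        .map(|lowercased| {
            let deunicoded = deunicode_with_tofu(&lowercased, "");
            if lowercased != deunicoded {
                deunicoded
            } else {
                lowercased
            }
        })
        .collect()
}

fn main() {
    let texts = vec!["téléphone".to_lowercase(), "iphone".to_lowercase()];
    assert_eq!(normalize(texts), vec!["telephone", "iphone"]);
}
// [end sketch]
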
- assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some()); + assert!(words_doc_indexes + .get(&"l’éteindre".to_owned().into_bytes()) + .is_some()); } } diff --git a/meilidb-core/src/reordered_attrs.rs b/meilidb-core/src/reordered_attrs.rs index ed11045ab..b2f9f1d6c 100644 --- a/meilidb-core/src/reordered_attrs.rs +++ b/meilidb-core/src/reordered_attrs.rs @@ -6,7 +6,10 @@ pub struct ReorderedAttrs { impl ReorderedAttrs { pub fn new() -> ReorderedAttrs { - ReorderedAttrs { count: 0, reorders: Vec::new() } + ReorderedAttrs { + count: 0, + reorders: Vec::new(), + } } pub fn insert_attribute(&mut self, attribute: u16) { diff --git a/meilidb-core/src/serde/convert_to_number.rs b/meilidb-core/src/serde/convert_to_number.rs index aec22730a..a67e01692 100644 --- a/meilidb-core/src/serde/convert_to_number.rs +++ b/meilidb-core/src/serde/convert_to_number.rs @@ -77,13 +77,18 @@ impl ser::Serializer for ConvertToNumber { } fn serialize_none(self) -> Result { - Err(SerializerError::UnrankableType { type_name: "Option" }) + Err(SerializerError::UnrankableType { + type_name: "Option", + }) } fn serialize_some(self, _value: &T) -> Result - where T: Serialize, + where + T: Serialize, { - Err(SerializerError::UnrankableType { type_name: "Option" }) + Err(SerializerError::UnrankableType { + type_name: "Option", + }) } fn serialize_unit(self) -> Result { @@ -91,25 +96,29 @@ impl ser::Serializer for ConvertToNumber { } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnrankableType { type_name: "unit struct" }) + Err(SerializerError::UnrankableType { + type_name: "unit struct", + }) } fn serialize_unit_variant( self, _name: &'static str, _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnrankableType { type_name: "unit variant" }) + _variant: &'static str, + ) -> Result { + Err(SerializerError::UnrankableType { + type_name: "unit variant", + }) } fn serialize_newtype_struct( self, _name: &'static str, - value: &T + value: &T, ) -> Result - where T: Serialize, + where + T: Serialize, { value.serialize(self) } @@ -119,15 +128,20 @@ impl ser::Serializer for ConvertToNumber { _name: &'static str, _variant_index: u32, _variant: &'static str, - _value: &T + _value: &T, ) -> Result - where T: Serialize, + where + T: Serialize, { - Err(SerializerError::UnrankableType { type_name: "newtype variant" }) + Err(SerializerError::UnrankableType { + type_name: "newtype variant", + }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnrankableType { type_name: "sequence" }) + Err(SerializerError::UnrankableType { + type_name: "sequence", + }) } fn serialize_tuple(self, _len: usize) -> Result { @@ -137,10 +151,11 @@ impl ser::Serializer for ConvertToNumber { fn serialize_tuple_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnrankableType { type_name: "tuple struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnrankableType { + type_name: "tuple struct", + }) } fn serialize_tuple_variant( @@ -148,10 +163,11 @@ impl ser::Serializer for ConvertToNumber { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnrankableType { type_name: "tuple variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnrankableType { + type_name: "tuple variant", + }) } fn serialize_map(self, _len: Option) -> Result { @@ -161,10 +177,11 @@ impl ser::Serializer for ConvertToNumber { fn 
serialize_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnrankableType { type_name: "struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnrankableType { + type_name: "struct", + }) } fn serialize_struct_variant( @@ -172,9 +189,10 @@ impl ser::Serializer for ConvertToNumber { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnrankableType { type_name: "struct variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnrankableType { + type_name: "struct variant", + }) } } diff --git a/meilidb-core/src/serde/convert_to_string.rs b/meilidb-core/src/serde/convert_to_string.rs index cd109f534..2f1fe0ca1 100644 --- a/meilidb-core/src/serde/convert_to_string.rs +++ b/meilidb-core/src/serde/convert_to_string.rs @@ -1,5 +1,5 @@ -use serde::Serialize; use serde::ser; +use serde::Serialize; use super::SerializerError; @@ -17,7 +17,9 @@ impl ser::Serializer for ConvertToString { type SerializeStructVariant = ser::Impossible; fn serialize_bool(self, _value: bool) -> Result { - Err(SerializerError::UnserializableType { type_name: "boolean" }) + Err(SerializerError::UnserializableType { + type_name: "boolean", + }) } fn serialize_char(self, value: char) -> Result { @@ -73,13 +75,18 @@ impl ser::Serializer for ConvertToString { } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnserializableType { + type_name: "Option", + }) } fn serialize_some(self, _value: &T) -> Result - where T: Serialize, + where + T: Serialize, { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnserializableType { + type_name: "Option", + }) } fn serialize_unit(self) -> Result { @@ -87,25 +94,29 @@ impl ser::Serializer for ConvertToString { } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { type_name: "unit struct" }) + Err(SerializerError::UnserializableType { + type_name: "unit struct", + }) } fn serialize_unit_variant( self, _name: &'static str, _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "unit variant" }) + _variant: &'static str, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "unit variant", + }) } fn serialize_newtype_struct( self, _name: &'static str, - value: &T + value: &T, ) -> Result - where T: Serialize, + where + T: Serialize, { value.serialize(self) } @@ -115,15 +126,20 @@ impl ser::Serializer for ConvertToString { _name: &'static str, _variant_index: u32, _variant: &'static str, - _value: &T + _value: &T, ) -> Result - where T: Serialize, + where + T: Serialize, { - Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + Err(SerializerError::UnserializableType { + type_name: "newtype variant", + }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { type_name: "sequence" }) + Err(SerializerError::UnserializableType { + type_name: "sequence", + }) } fn serialize_tuple(self, _len: usize) -> Result { @@ -133,10 +149,11 @@ impl ser::Serializer for ConvertToString { fn serialize_tuple_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "tuple struct", + }) } fn 
serialize_tuple_variant( @@ -144,10 +161,11 @@ impl ser::Serializer for ConvertToString { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "tuple variant", + }) } fn serialize_map(self, _len: Option) -> Result { @@ -157,10 +175,11 @@ impl ser::Serializer for ConvertToString { fn serialize_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "struct", + }) } fn serialize_struct_variant( @@ -168,9 +187,10 @@ impl ser::Serializer for ConvertToString { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "struct variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "struct variant", + }) } } diff --git a/meilidb-core/src/serde/deserializer.rs b/meilidb-core/src/serde/deserializer.rs index df5a8b502..3c99b365b 100644 --- a/meilidb-core/src/serde/deserializer.rs +++ b/meilidb-core/src/serde/deserializer.rs @@ -1,12 +1,12 @@ use std::collections::HashSet; use std::io::Cursor; -use std::{fmt, error::Error}; +use std::{error::Error, fmt}; use meilidb_schema::{Schema, SchemaAttr}; -use serde_json::Error as SerdeJsonError; -use serde_json::Deserializer as SerdeJsonDeserializer; -use serde_json::de::IoRead as SerdeJsonIoRead; use serde::{de, forward_to_deserialize_any}; +use serde_json::de::IoRead as SerdeJsonIoRead; +use serde_json::Deserializer as SerdeJsonDeserializer; +use serde_json::Error as SerdeJsonError; use crate::store::DocumentsFields; use crate::DocumentId; @@ -60,7 +60,8 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> { type Error = DeserializerError; fn deserialize_any(self, visitor: V) -> Result - where V: de::Visitor<'de> + where + V: de::Visitor<'de>, { self.deserialize_map(visitor) } @@ -72,16 +73,21 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> { } fn deserialize_map(self, visitor: V) -> Result - where V: de::Visitor<'de> + where + V: de::Visitor<'de>, { let mut error = None; - let iter = self.documents_fields + let iter = self + .documents_fields .document_fields(self.reader, self.document_id)? 
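
// [illustrative sketch — not part of the original patch]
// The document Deserializer above feeds an iterator of (attribute, value)
// pairs into serde's MapDeserializer and lets the visitor walk it like a
// map. Standalone sketch with plain string pairs standing in for the
// stored document fields:
use serde::de::value::{Error, MapDeserializer};
use serde::Deserialize;
use std::collections::HashMap;

fn main() -> Result<(), Error> {
    let fields = vec![("title", "iphone"), ("brand", "apple")].into_iter();

    let deserializer = MapDeserializer::<_, Error>::new(fields);
    let document: HashMap<String, String> = HashMap::deserialize(deserializer)?;

    assert_eq!(document["title"], "iphone");
    Ok(())
}
// [end sketch]
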
.filter_map(|result| { let (attr, value) = match result { Ok(value) => value, - Err(e) => { error = Some(e); return None }, + Err(e) => { + error = Some(e); + return None; + } }; let is_displayed = self.schema.props(attr).is_displayed(); @@ -99,7 +105,9 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> { }); let map_deserializer = de::value::MapDeserializer::new(iter); - let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from); + let result = visitor + .visit_map(map_deserializer) + .map_err(DeserializerError::from); match error.take() { Some(error) => Err(error.into()), @@ -122,7 +130,8 @@ impl<'de> de::Deserializer<'de> for Value { type Error = SerdeJsonError; fn deserialize_any(mut self, visitor: V) -> Result - where V: de::Visitor<'de> + where + V: de::Visitor<'de>, { self.0.deserialize_any(visitor) } diff --git a/meilidb-core/src/serde/extract_document_id.rs b/meilidb-core/src/serde/extract_document_id.rs index da90101e2..5c2a81d2e 100644 --- a/meilidb-core/src/serde/extract_document_id.rs +++ b/meilidb-core/src/serde/extract_document_id.rs @@ -5,13 +5,14 @@ use serde::{ser, Serialize}; use serde_json::Value; use siphasher::sip::SipHasher; -use super::{SerializerError, ConvertToString}; +use super::{ConvertToString, SerializerError}; pub fn extract_document_id( identifier: &str, document: &D, ) -> Result, SerializerError> -where D: serde::Serialize, +where + D: serde::Serialize, { let serializer = ExtractDocumentId { identifier }; document.serialize(serializer) @@ -77,13 +78,18 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnserializableType { + type_name: "Option", + }) } fn serialize_some(self, _value: &T) -> Result - where T: Serialize, + where + T: Serialize, { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnserializableType { + type_name: "Option", + }) } fn serialize_unit(self) -> Result { @@ -91,25 +97,29 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { type_name: "unit struct" }) + Err(SerializerError::UnserializableType { + type_name: "unit struct", + }) } fn serialize_unit_variant( self, _name: &'static str, _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "unit variant" }) + _variant: &'static str, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "unit variant", + }) } fn serialize_newtype_struct( self, _name: &'static str, - value: &T + value: &T, ) -> Result - where T: Serialize, + where + T: Serialize, { value.serialize(self) } @@ -119,15 +129,20 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _value: &T + _value: &T, ) -> Result - where T: Serialize, + where + T: Serialize, { - Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + Err(SerializerError::UnserializableType { + type_name: "newtype variant", + }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { type_name: "sequence" }) + Err(SerializerError::UnserializableType { + type_name: "sequence", + }) } fn serialize_tuple(self, _len: usize) -> Result { @@ -137,10 +152,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { fn 
serialize_tuple_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "tuple struct", + }) } fn serialize_tuple_variant( @@ -148,10 +164,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "tuple variant", + }) } fn serialize_map(self, _len: Option) -> Result { @@ -167,9 +184,8 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { fn serialize_struct( self, _name: &'static str, - _len: usize - ) -> Result - { + _len: usize, + ) -> Result { let serializer = ExtractDocumentIdStructSerializer { identifier: self.identifier, document_id: None, @@ -183,10 +199,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "struct variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "struct variant", + }) } } @@ -201,7 +218,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { type Error = SerializerError; fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: Serialize, + where + T: Serialize, { let key = key.serialize(ConvertToString)?; self.current_key_name = Some(key); @@ -209,7 +227,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { } fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize, + where + T: Serialize, { let key = self.current_key_name.take().unwrap(); self.serialize_entry(&key, value) @@ -218,9 +237,11 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { fn serialize_entry( &mut self, key: &K, - value: &V + value: &V, ) -> Result<(), Self::Error> - where K: Serialize, V: Serialize, + where + K: Serialize, + V: Serialize, { let key = key.serialize(ConvertToString)?; @@ -252,9 +273,10 @@ impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> { fn serialize_field( &mut self, key: &'static str, - value: &T + value: &T, ) -> Result<(), Self::Error> - where T: Serialize, + where + T: Serialize, { if self.identifier == key { let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?; diff --git a/meilidb-core/src/serde/indexer.rs b/meilidb-core/src/serde/indexer.rs index 514b97951..04d24a63a 100644 --- a/meilidb-core/src/serde/indexer.rs +++ b/meilidb-core/src/serde/indexer.rs @@ -2,9 +2,9 @@ use meilidb_schema::SchemaAttr; use serde::ser; use serde::Serialize; -use crate::DocumentId; +use super::{ConvertToString, SerializerError}; use crate::raw_indexer::RawIndexer; -use super::{SerializerError, ConvertToString}; +use crate::DocumentId; pub struct Indexer<'a> { pub attribute: SchemaAttr, @@ -24,7 +24,9 @@ impl<'a> ser::Serializer for Indexer<'a> { type SerializeStructVariant = ser::Impossible; fn serialize_bool(self, _value: bool) -> Result { - Err(SerializerError::UnindexableType { type_name: "boolean" }) + Err(SerializerError::UnindexableType { + type_name: "boolean", + }) } fn serialize_char(self, value: char) -> Result { @@ -83,7 +85,9 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn serialize_str(self, 
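
// [illustrative sketch — not part of the original patch]
// extract_document_id above derives a DocumentId by hashing the serialized
// identifier field (the real code uses the siphasher crate). Sketch of the
// same idea with std's DefaultHasher, which is currently also SipHash-based
// (the exact algorithm is unspecified by std):
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn compute_document_id(identifier_value: &str) -> u64 {
    let mut hasher = DefaultHasher::new();
    identifier_value.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    println!("DocumentId({})", compute_document_id("iphone-7"));
}
// [end sketch]
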
text: &str) -> Result { - let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text); + let number_of_words = self + .indexer + .index_text(self.document_id, self.attribute, text); Ok(Some(number_of_words)) } @@ -92,14 +96,19 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn serialize_none(self) -> Result { - Err(SerializerError::UnindexableType { type_name: "Option" }) + Err(SerializerError::UnindexableType { + type_name: "Option", + }) } fn serialize_some(self, value: &T) -> Result - where T: ser::Serialize, + where + T: ser::Serialize, { let text = value.serialize(ConvertToString)?; - let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text); + let number_of_words = self + .indexer + .index_text(self.document_id, self.attribute, &text); Ok(Some(number_of_words)) } @@ -108,25 +117,29 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnindexableType { type_name: "unit struct" }) + Err(SerializerError::UnindexableType { + type_name: "unit struct", + }) } fn serialize_unit_variant( self, _name: &'static str, _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnindexableType { type_name: "unit variant" }) + _variant: &'static str, + ) -> Result { + Err(SerializerError::UnindexableType { + type_name: "unit variant", + }) } fn serialize_newtype_struct( self, _name: &'static str, - value: &T + value: &T, ) -> Result - where T: ser::Serialize, + where + T: ser::Serialize, { value.serialize(self) } @@ -136,11 +149,14 @@ impl<'a> ser::Serializer for Indexer<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _value: &T + _value: &T, ) -> Result - where T: ser::Serialize, + where + T: ser::Serialize, { - Err(SerializerError::UnindexableType { type_name: "newtype variant" }) + Err(SerializerError::UnindexableType { + type_name: "newtype variant", + }) } fn serialize_seq(self, _len: Option) -> Result { @@ -168,10 +184,11 @@ impl<'a> ser::Serializer for Indexer<'a> { fn serialize_tuple_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnindexableType { type_name: "tuple struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnindexableType { + type_name: "tuple struct", + }) } fn serialize_tuple_variant( @@ -179,10 +196,11 @@ impl<'a> ser::Serializer for Indexer<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnindexableType { type_name: "tuple variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnindexableType { + type_name: "tuple variant", + }) } fn serialize_map(self, _len: Option) -> Result { @@ -199,10 +217,11 @@ impl<'a> ser::Serializer for Indexer<'a> { fn serialize_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnindexableType { type_name: "struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnindexableType { + type_name: "struct", + }) } fn serialize_struct_variant( @@ -210,10 +229,11 @@ impl<'a> ser::Serializer for Indexer<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnindexableType { type_name: "struct variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnindexableType { + type_name: "struct variant", + }) } } @@ -229,7 +249,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> { type Error = 
SerializerError; fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where T: ser::Serialize + where + T: ser::Serialize, { let text = value.serialize(ConvertToString)?; self.texts.push(text); @@ -238,7 +259,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); - self.indexer.index_text_seq(self.document_id, self.attribute, texts); + self.indexer + .index_text_seq(self.document_id, self.attribute, texts); Ok(None) } } @@ -255,7 +277,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> { type Error = SerializerError; fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: ser::Serialize, + where + T: ser::Serialize, { let text = key.serialize(ConvertToString)?; self.texts.push(text); @@ -263,7 +286,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> { } fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: ser::Serialize, + where + T: ser::Serialize, { let text = value.serialize(ConvertToString)?; self.texts.push(text); @@ -272,7 +296,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); - self.indexer.index_text_seq(self.document_id, self.attribute, texts); + self.indexer + .index_text_seq(self.document_id, self.attribute, texts); Ok(None) } } @@ -293,7 +318,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { key: &'static str, value: &T, ) -> Result<(), Self::Error> - where T: ser::Serialize, + where + T: ser::Serialize, { let key_text = key.to_owned(); let value_text = value.serialize(ConvertToString)?; @@ -304,7 +330,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); - self.indexer.index_text_seq(self.document_id, self.attribute, texts); + self.indexer + .index_text_seq(self.document_id, self.attribute, texts); Ok(None) } } @@ -321,7 +348,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> { type Error = SerializerError; fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize + where + T: Serialize, { let text = value.serialize(ConvertToString)?; self.texts.push(text); @@ -330,7 +358,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); - self.indexer.index_text_seq(self.document_id, self.attribute, texts); + self.indexer + .index_text_seq(self.document_id, self.attribute, texts); Ok(None) } } diff --git a/meilidb-core/src/serde/mod.rs b/meilidb-core/src/serde/mod.rs index dde014435..c2feafbf0 100644 --- a/meilidb-core/src/serde/mod.rs +++ b/meilidb-core/src/serde/mod.rs @@ -15,19 +15,19 @@ mod extract_document_id; mod indexer; mod serializer; -pub use self::deserializer::{Deserializer, DeserializerError}; -pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string}; -pub use self::convert_to_string::ConvertToString; pub use self::convert_to_number::ConvertToNumber; +pub use self::convert_to_string::ConvertToString; +pub use self::deserializer::{Deserializer, DeserializerError}; +pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; pub use self::indexer::Indexer; pub use self::serializer::Serializer; use std::collections::BTreeMap; -use std::{fmt, error::Error}; +use std::{error::Error, fmt}; use meilidb_schema::SchemaAttr; -use serde_json::Error as SerdeJsonError; use serde::ser; +use 
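
// [illustrative sketch — not part of the original patch]
// SeqIndexer, MapIndexer and the other serializers above all buffer each
// serialized element as a String, then hand the whole batch to
// index_text_seq as &str items. Sketch of that collect-then-borrow step,
// with a stub standing in for index_text_seq:
fn index_text_seq<'a, I>(texts: I)
where
    I: IntoIterator<Item = &'a str>,
{
    for (pos, text) in texts.into_iter().enumerate() {
        println!("indexing fragment {}: {}", pos, text);
    }
}

fn main() {
    let mut texts: Vec<String> = Vec::new();
    texts.push(42.to_string()); // each serialized element lands here
    texts.push("case".to_owned());

    // borrow the buffered Strings as &str for the single indexing pass
    index_text_seq(texts.iter().map(String::as_str));
}
// [end sketch]
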
serde_json::Error as SerdeJsonError; use crate::{DocumentId, ParseNumberError}; @@ -55,24 +55,24 @@ impl fmt::Display for SerializerError { match self { SerializerError::DocumentIdNotFound => { f.write_str("serialized document does not have an id according to the schema") - }, + } SerializerError::InvalidDocumentIdType => { f.write_str("document identifier can only be of type string or number") - }, + } SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e), SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e), SerializerError::ParseNumber(e) => { write!(f, "error while trying to parse a number: {}", e) - }, + } SerializerError::UnserializableType { type_name } => { write!(f, "{} is not a serializable type", type_name) - }, + } SerializerError::UnindexableType { type_name } => { write!(f, "{} is not an indexable type", type_name) - }, + } SerializerError::UnrankableType { type_name } => { write!(f, "{} types can not be used for ranking", type_name) - }, + } SerializerError::Custom(s) => f.write_str(s), } } @@ -119,3 +119,9 @@ impl RamDocumentStore { self.0 } } + +impl Default for RamDocumentStore { + fn default() -> Self { + Self::new() + } +} diff --git a/meilidb-core/src/serde/serializer.rs b/meilidb-core/src/serde/serializer.rs index de09c57bc..fa197f620 100644 --- a/meilidb-core/src/serde/serializer.rs +++ b/meilidb-core/src/serde/serializer.rs @@ -1,12 +1,12 @@ -use std::collections::HashMap; use meilidb_schema::{Schema, SchemaAttr}; use serde::ser; +use std::collections::HashMap; -use crate::{DocumentId, RankedMap}; use crate::raw_indexer::RawIndexer; use crate::serde::RamDocumentStore; +use crate::{DocumentId, RankedMap}; -use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; +use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError}; pub struct Serializer<'a> { pub schema: &'a Schema, @@ -55,13 +55,18 @@ impl<'a> ser::Serializer for Serializer<'a> { } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnserializableType { + type_name: "Option", + }) } fn serialize_some(self, _value: &T) -> Result - where T: ser::Serialize, + where + T: ser::Serialize, { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnserializableType { + type_name: "Option", + }) } fn serialize_unit(self) -> Result { @@ -69,25 +74,29 @@ impl<'a> ser::Serializer for Serializer<'a> { } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { type_name: "unit struct" }) + Err(SerializerError::UnserializableType { + type_name: "unit struct", + }) } fn serialize_unit_variant( self, _name: &'static str, _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "unit variant" }) + _variant: &'static str, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "unit variant", + }) } fn serialize_newtype_struct( self, _name: &'static str, - value: &T + value: &T, ) -> Result - where T: ser::Serialize, + where + T: ser::Serialize, { value.serialize(self) } @@ -97,15 +106,20 @@ impl<'a> ser::Serializer for Serializer<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _value: &T + _value: &T, ) -> Result - where T: ser::Serialize, + where + T: ser::Serialize, { - Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + Err(SerializerError::UnserializableType { + type_name: 
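
// [illustrative sketch — not part of the original patch]
// The SerializerError Display impl above is the usual match-per-variant
// shape (rustfmt drops the trailing commas after block-bodied arms). Two of
// its variants, reproduced as a self-contained sketch:
use std::error::Error;
use std::fmt;

#[derive(Debug)]
enum SketchError {
    DocumentIdNotFound,
    UnserializableType { type_name: &'static str },
}

impl fmt::Display for SketchError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            SketchError::DocumentIdNotFound => {
                f.write_str("serialized document does not have an id according to the schema")
            }
            SketchError::UnserializableType { type_name } => {
                write!(f, "{} is not a serializable type", type_name)
            }
        }
    }
}

impl Error for SketchError {}

fn main() {
    println!("{}", SketchError::UnserializableType { type_name: "sequence" });
}
// [end sketch]
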
"newtype variant", + }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { type_name: "sequence" }) + Err(SerializerError::UnserializableType { + type_name: "sequence", + }) } fn serialize_tuple(self, _len: usize) -> Result { @@ -115,10 +129,11 @@ impl<'a> ser::Serializer for Serializer<'a> { fn serialize_tuple_struct( self, _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "tuple struct", + }) } fn serialize_tuple_variant( @@ -126,10 +141,11 @@ impl<'a> ser::Serializer for Serializer<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "tuple variant", + }) } fn serialize_map(self, _len: Option) -> Result { @@ -147,9 +163,8 @@ impl<'a> ser::Serializer for Serializer<'a> { fn serialize_struct( self, _name: &'static str, - _len: usize - ) -> Result - { + _len: usize, + ) -> Result { Ok(StructSerializer { schema: self.schema, document_id: self.document_id, @@ -165,10 +180,11 @@ impl<'a> ser::Serializer for Serializer<'a> { _name: &'static str, _variant_index: u32, _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { type_name: "struct variant" }) + _len: usize, + ) -> Result { + Err(SerializerError::UnserializableType { + type_name: "struct variant", + }) } } @@ -187,7 +203,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { type Error = SerializerError; fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: ser::Serialize, + where + T: ser::Serialize, { let key = key.serialize(ConvertToString)?; self.current_key_name = Some(key); @@ -195,7 +212,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { } fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: ser::Serialize, + where + T: ser::Serialize, { let key = self.current_key_name.take().unwrap(); self.serialize_entry(&key, value) @@ -206,7 +224,9 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { key: &K, value: &V, ) -> Result<(), Self::Error> - where K: ser::Serialize, V: ser::Serialize, + where + K: ser::Serialize, + V: ser::Serialize, { let key = key.serialize(ConvertToString)?; @@ -245,7 +265,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { key: &'static str, value: &T, ) -> Result<(), Self::Error> - where T: ser::Serialize, + where + T: ser::Serialize, { serialize_value( self.schema, @@ -274,7 +295,8 @@ fn serialize_value( key: &str, value: &T, ) -> Result<(), SerializerError> -where T: ser::Serialize, +where + T: ser::Serialize, { if let Some(attribute) = schema.attribute(key) { let props = schema.props(attribute); @@ -283,7 +305,11 @@ where T: ser::Serialize, document_store.set_document_field(document_id, attribute, serialized); if props.is_indexed() { - let indexer = Indexer { attribute, indexer, document_id }; + let indexer = Indexer { + attribute, + indexer, + document_id, + }; if let Some(number_of_words) = value.serialize(indexer)? 
{ documents_fields_counts.insert((document_id, attribute), number_of_words as u64); } diff --git a/meilidb-core/src/store/docs_words.rs b/meilidb-core/src/store/docs_words.rs index 6cdf555d1..93d4192e3 100644 --- a/meilidb-core/src/store/docs_words.rs +++ b/meilidb-core/src/store/docs_words.rs @@ -1,8 +1,8 @@ -use std::sync::Arc; -use zlmdb::types::{OwnedType, ByteSlice}; -use zlmdb::Result as ZResult; -use crate::DocumentId; use super::BEU64; +use crate::DocumentId; +use std::sync::Arc; +use zlmdb::types::{ByteSlice, OwnedType}; +use zlmdb::Result as ZResult; #[derive(Copy, Clone)] pub struct DocsWords { @@ -11,33 +11,30 @@ pub struct DocsWords { impl DocsWords { pub fn put_doc_words( - &self, + self, writer: &mut zlmdb::RwTxn, document_id: DocumentId, words: &fst::Set, - ) -> ZResult<()> - { + ) -> ZResult<()> { let document_id = BEU64::new(document_id.0); let bytes = words.as_fst().as_bytes(); self.docs_words.put(writer, &document_id, bytes) } pub fn del_doc_words( - &self, + self, writer: &mut zlmdb::RwTxn, document_id: DocumentId, - ) -> ZResult - { + ) -> ZResult { let document_id = BEU64::new(document_id.0); self.docs_words.delete(writer, &document_id) } pub fn doc_words( - &self, + self, reader: &zlmdb::RoTxn, document_id: DocumentId, - ) -> ZResult> - { + ) -> ZResult> { let document_id = BEU64::new(document_id.0); match self.docs_words.get(reader, &document_id)? { Some(bytes) => { @@ -45,7 +42,7 @@ impl DocsWords { let bytes = Arc::from(bytes); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); Ok(Some(fst::Set::from(fst))) - }, + } None => Ok(None), } } diff --git a/meilidb-core/src/store/documents_fields.rs b/meilidb-core/src/store/documents_fields.rs index 7b528af97..84b53351b 100644 --- a/meilidb-core/src/store/documents_fields.rs +++ b/meilidb-core/src/store/documents_fields.rs @@ -1,9 +1,9 @@ use meilidb_schema::SchemaAttr; -use zlmdb::types::{OwnedType, ByteSlice}; +use zlmdb::types::{ByteSlice, OwnedType}; use zlmdb::Result as ZResult; -use crate::DocumentId; use super::DocumentAttrKey; +use crate::DocumentId; #[derive(Copy, Clone)] pub struct DocumentsFields { @@ -12,45 +12,41 @@ pub struct DocumentsFields { impl DocumentsFields { pub fn put_document_field( - &self, + self, writer: &mut zlmdb::RwTxn, document_id: DocumentId, attribute: SchemaAttr, value: &[u8], - ) -> ZResult<()> - { + ) -> ZResult<()> { let key = DocumentAttrKey::new(document_id, attribute); self.documents_fields.put(writer, &key, value) } pub fn del_all_document_fields( - &self, + self, writer: &mut zlmdb::RwTxn, document_id: DocumentId, - ) -> ZResult - { + ) -> ZResult { let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); self.documents_fields.delete_range(writer, start..=end) } pub fn document_attribute<'txn>( - &self, + self, reader: &'txn zlmdb::RoTxn, document_id: DocumentId, attribute: SchemaAttr, - ) -> ZResult> - { + ) -> ZResult> { let key = DocumentAttrKey::new(document_id, attribute); self.documents_fields.get(reader, &key) } pub fn document_fields<'txn>( - &self, + self, reader: &'txn zlmdb::RoTxn, document_id: DocumentId, - ) -> ZResult> - { + ) -> ZResult> { let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let iter = self.documents_fields.range(reader, start..=end)?; @@ -70,8 +66,8 @@ impl<'txn> Iterator for DocumentFieldsIter<'txn> { Some(Ok((key, bytes))) => { let attr = SchemaAttr(key.attr.get()); Some(Ok((attr, 
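
// [illustrative sketch — not part of the original patch]
// From docs_words.rs onward the store methods switch from &self to self:
// each store is a small #[derive(Copy, Clone)] handle, so taking it by
// value is free, which is what clippy::trivially_copy_pass_by_ref suggests.
// Minimal sketch:
#[derive(Copy, Clone)]
struct Store {
    handle: u32, // stands in for an LMDB database handle
}

impl Store {
    fn get(self, key: u64) -> Option<u64> {
        // `self` is a cheap copy; the caller's handle stays usable
        Some(u64::from(self.handle) + key)
    }
}

fn main() {
    let store = Store { handle: 1 };
    let _ = store.get(42);
    let _ = store.get(43); // still valid: Store is Copy
}
// [end sketch]
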
bytes))) - }, - Some(Err(e)) => Some(Err(e.into())), + } + Some(Err(e)) => Some(Err(e)), None => None, } } diff --git a/meilidb-core/src/store/documents_fields_counts.rs b/meilidb-core/src/store/documents_fields_counts.rs index c075f703b..b765c8f25 100644 --- a/meilidb-core/src/store/documents_fields_counts.rs +++ b/meilidb-core/src/store/documents_fields_counts.rs @@ -1,8 +1,8 @@ +use super::DocumentAttrKey; +use crate::DocumentId; use meilidb_schema::SchemaAttr; use zlmdb::types::OwnedType; use zlmdb::Result as ZResult; -use crate::DocumentId; -use super::DocumentAttrKey; #[derive(Copy, Clone)] pub struct DocumentsFieldsCounts { @@ -11,35 +11,33 @@ pub struct DocumentsFieldsCounts { impl DocumentsFieldsCounts { pub fn put_document_field_count( - &self, + self, writer: &mut zlmdb::RwTxn, document_id: DocumentId, attribute: SchemaAttr, value: u64, - ) -> ZResult<()> - { + ) -> ZResult<()> { let key = DocumentAttrKey::new(document_id, attribute); self.documents_fields_counts.put(writer, &key, &value) } pub fn del_all_document_fields_counts( - &self, + self, writer: &mut zlmdb::RwTxn, document_id: DocumentId, - ) -> ZResult - { + ) -> ZResult { let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); - self.documents_fields_counts.delete_range(writer, start..=end) + self.documents_fields_counts + .delete_range(writer, start..=end) } pub fn document_field_count( - &self, + self, reader: &zlmdb::RoTxn, document_id: DocumentId, attribute: SchemaAttr, - ) -> ZResult> - { + ) -> ZResult> { let key = DocumentAttrKey::new(document_id, attribute); match self.documents_fields_counts.get(reader, &key)? { Some(count) => Ok(Some(count)), @@ -48,11 +46,10 @@ impl DocumentsFieldsCounts { } pub fn document_fields_counts<'txn>( - &self, + self, reader: &'txn zlmdb::RoTxn, document_id: DocumentId, - ) -> ZResult> - { + ) -> ZResult> { let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let iter = self.documents_fields_counts.range(reader, start..=end)?; @@ -60,19 +57,20 @@ impl DocumentsFieldsCounts { } pub fn documents_ids<'txn>( - &self, + self, reader: &'txn zlmdb::RoTxn, - ) -> ZResult> - { + ) -> ZResult> { let iter = self.documents_fields_counts.iter(reader)?; - Ok(DocumentsIdsIter { last_seen_id: None, iter }) + Ok(DocumentsIdsIter { + last_seen_id: None, + iter, + }) } pub fn all_documents_fields_counts<'txn>( - &self, + self, reader: &'txn zlmdb::RoTxn, - ) -> ZResult> - { + ) -> ZResult> { let iter = self.documents_fields_counts.iter(reader)?; Ok(AllDocumentsFieldsCountsIter { iter }) } @@ -90,8 +88,8 @@ impl Iterator for DocumentFieldsCountsIter<'_> { Some(Ok((key, count))) => { let attr = SchemaAttr(key.attr.get()); Some(Ok((attr, count))) - }, - Some(Err(e)) => Some(Err(e.into())), + } + Some(Err(e)) => Some(Err(e)), None => None, } } @@ -112,10 +110,10 @@ impl Iterator for DocumentsIdsIter<'_> { let document_id = DocumentId(key.docid.get()); if Some(document_id) != self.last_seen_id { self.last_seen_id = Some(document_id); - return Some(Ok(document_id)) + return Some(Ok(document_id)); } - }, - Err(e) => return Some(Err(e.into())), + } + Err(e) => return Some(Err(e)), } } None @@ -135,8 +133,8 @@ impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> { let docid = DocumentId(key.docid.get()); let attr = SchemaAttr(key.attr.get()); Some(Ok((docid, attr, count))) - }, - Some(Err(e)) => Some(Err(e.into())), + } + Some(Err(e)) => Some(Err(e)), None => 
None, } } diff --git a/meilidb-core/src/store/main.rs b/meilidb-core/src/store/main.rs index 085609e4f..6e99ac067 100644 --- a/meilidb-core/src/store/main.rs +++ b/meilidb-core/src/store/main.rs @@ -1,15 +1,15 @@ -use std::sync::Arc; -use meilidb_schema::Schema; -use zlmdb::types::{Str, OwnedType, ByteSlice, Serde}; -use zlmdb::Result as ZResult; use crate::RankedMap; +use meilidb_schema::Schema; +use std::sync::Arc; +use zlmdb::types::{ByteSlice, OwnedType, Serde, Str}; +use zlmdb::Result as ZResult; -const CUSTOMS_KEY: &str = "customs-key"; +const CUSTOMS_KEY: &str = "customs-key"; const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; -const RANKED_MAP_KEY: &str = "ranked-map"; -const SCHEMA_KEY: &str = "schema"; -const SYNONYMS_KEY: &str = "synonyms"; -const WORDS_KEY: &str = "words"; +const RANKED_MAP_KEY: &str = "ranked-map"; +const SCHEMA_KEY: &str = "schema"; +const SYNONYMS_KEY: &str = "synonyms"; +const WORDS_KEY: &str = "words"; #[derive(Copy, Clone)] pub struct Main { @@ -17,76 +17,85 @@ pub struct Main { } impl Main { - pub fn put_words_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { + pub fn put_words_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { let bytes = fst.as_fst().as_bytes(); self.main.put::(writer, WORDS_KEY, bytes) } - pub fn words_fst(&self, reader: &zlmdb::RoTxn) -> ZResult> { + pub fn words_fst(self, reader: &zlmdb::RoTxn) -> ZResult> { match self.main.get::(reader, WORDS_KEY)? { Some(bytes) => { let len = bytes.len(); let bytes = Arc::from(bytes); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); Ok(Some(fst::Set::from(fst))) - }, + } None => Ok(None), } } - pub fn put_schema(&self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> { - self.main.put::>(writer, SCHEMA_KEY, schema) + pub fn put_schema(self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> { + self.main + .put::>(writer, SCHEMA_KEY, schema) } - pub fn schema(&self, reader: &zlmdb::RoTxn) -> ZResult> { + pub fn schema(self, reader: &zlmdb::RoTxn) -> ZResult> { self.main.get::>(reader, SCHEMA_KEY) } - pub fn put_ranked_map(&self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> { - self.main.put::>(writer, RANKED_MAP_KEY, &ranked_map) + pub fn put_ranked_map(self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> { + self.main + .put::>(writer, RANKED_MAP_KEY, &ranked_map) } - pub fn ranked_map(&self, reader: &zlmdb::RoTxn) -> ZResult> { - self.main.get::>(reader, RANKED_MAP_KEY) + pub fn ranked_map(self, reader: &zlmdb::RoTxn) -> ZResult> { + self.main + .get::>(reader, RANKED_MAP_KEY) } - pub fn put_synonyms_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { + pub fn put_synonyms_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { let bytes = fst.as_fst().as_bytes(); self.main.put::(writer, SYNONYMS_KEY, bytes) } - pub fn synonyms_fst(&self, reader: &zlmdb::RoTxn) -> ZResult> { + pub fn synonyms_fst(self, reader: &zlmdb::RoTxn) -> ZResult> { match self.main.get::(reader, SYNONYMS_KEY)? 
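
// [illustrative sketch — not part of the original patch]
// put_words_fst above persists a set as its raw fst bytes, and words_fst /
// synonyms_fst rebuild a Set from those bytes on read. Round-trip sketch,
// again assuming the fst 0.3-era API (as_fst().as_bytes(), Set::from_bytes):
use fst::Set;

fn main() -> fst::Result<()> {
    let words = Set::from_iter(vec!["apple", "iphone"])?;

    // what put_words_fst stores under the "words" key
    let bytes: Vec<u8> = words.as_fst().as_bytes().to_vec();

    // what words_fst does with the bytes it reads back
    let reloaded = Set::from_bytes(bytes)?;
    assert!(reloaded.contains("iphone"));
    Ok(())
}
// [end sketch]
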
             Some(bytes) => {
                 let len = bytes.len();
                 let bytes = Arc::from(bytes);
                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
                 Ok(Some(fst::Set::from(fst)))
-            },
+            }
             None => Ok(None),
         }
     }
 
-    pub fn put_number_of_documents<F>(&self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
-    where F: Fn(u64) -> u64,
+    pub fn put_number_of_documents<F>(self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
+    where
+        F: Fn(u64) -> u64,
     {
         let new = self.number_of_documents(writer).map(f)?;
-        self.main.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
+        self.main
+            .put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
         Ok(new)
     }
 
-    pub fn number_of_documents(&self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
-        match self.main.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? {
+    pub fn number_of_documents(self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
+        match self
+            .main
+            .get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
+        {
             Some(value) => Ok(value),
             None => Ok(0),
         }
     }
 
-    pub fn put_customs(&self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
-        self.main.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
+    pub fn put_customs(self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
+        self.main
+            .put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
     }
 
-    pub fn customs<'txn>(&self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
+    pub fn customs<'txn>(self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
         self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
     }
 }
diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs
index fee4e8d8a..68170e64e 100644
--- a/meilidb-core/src/store/mod.rs
+++ b/meilidb-core/src/store/mod.rs
@@ -8,8 +8,10 @@ mod updates;
 mod updates_results;
 
 pub use self::docs_words::DocsWords;
-pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
-pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter};
+pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
+pub use self::documents_fields_counts::{
+    DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
+};
 pub use self::main::Main;
 pub use self::postings_lists::PostingsLists;
 pub use self::synonyms::Synonyms;
@@ -25,19 +27,24 @@ use zlmdb::Result as ZResult;
 
 use crate::criterion::Criteria;
 use crate::serde::Deserializer;
-use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
+use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
 
 type BEU64 = zerocopy::U64<BigEndian>;
 type BEU16 = zerocopy::U16<BigEndian>;
 
-#[derive(Debug, Copy, Clone)]
-#[derive(AsBytes, FromBytes)]
+#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
 #[repr(C)]
-pub struct DocumentAttrKey { docid: BEU64, attr: BEU16 }
+pub struct DocumentAttrKey {
+    docid: BEU64,
+    attr: BEU16,
+}
 
 impl DocumentAttrKey {
     fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
-        DocumentAttrKey { docid: BEU64::new(docid.0), attr: BEU16::new(attr.0) }
+        DocumentAttrKey {
+            docid: BEU64::new(docid.0),
+            attr: BEU16::new(attr.0),
+        }
     }
 }
 
@@ -93,13 +100,15 @@ impl Index {
         reader: &zlmdb::RoTxn,
         attributes: Option<&HashSet<&str>>,
         document_id: DocumentId,
-    ) -> MResult<Option<T>>
-    {
+    ) -> MResult<Option<T>> {
         let schema = self.main.schema(reader)?;
         let schema = schema.ok_or(Error::SchemaMissing)?;
 
         let attributes = match attributes {
-            Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(),
+            Some(attributes) => attributes
+                .iter()
+                .map(|name| schema.attribute(name))
+                .collect(),
             None => None,
         };
 
@@ -121,9 +130,10 @@ impl Index {
         reader: &zlmdb::RoTxn,
         document_id: DocumentId,
         attribute: SchemaAttr,
-    ) -> MResult<Option<T>>
-    {
-        let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?;
+    ) -> MResult<Option<T>> {
+        let bytes = self
+            .documents_fields
+            .document_attribute(reader, document_id, attribute)?;
         match bytes {
             Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
             None => Ok(None),
@@ -183,14 +193,8 @@ impl Index {
         &self,
         reader: &zlmdb::RoTxn,
         update_id: u64,
-    ) -> MResult<update::UpdateStatus>
-    {
-        update::update_status(
-            reader,
-            self.updates,
-            self.updates_results,
-            update_id,
-        )
+    ) -> MResult<update::UpdateStatus> {
+        update::update_status(reader, self.updates, self.updates_results, update_id)
     }
 
     pub fn query_builder(&self) -> QueryBuilder {
@@ -205,8 +209,7 @@ impl Index {
     pub fn query_builder_with_criteria<'c, 'f, 'd>(
         &self,
         criteria: Criteria<'c>,
-    ) -> QueryBuilder<'c, 'f, 'd>
-    {
+    ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder::with_criteria(
             self.main,
             self.postings_lists,
@@ -221,8 +224,7 @@ pub fn create(
     env: &zlmdb::Env,
     name: &str,
     updates_notifier: crossbeam_channel::Sender<()>,
-) -> MResult<Index>
-{
+) -> MResult<Index> {
     // create all the store names
     let main_name = main_name(name);
     let postings_lists_name = postings_lists_name(name);
@@ -247,7 +249,9 @@ pub fn create(
         main: Main { main },
         postings_lists: PostingsLists { postings_lists },
         documents_fields: DocumentsFields { documents_fields },
-        documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
+        documents_fields_counts: DocumentsFieldsCounts {
+            documents_fields_counts,
+        },
         synonyms: Synonyms { synonyms },
         docs_words: DocsWords { docs_words },
         updates: Updates { updates },
@@ -260,8 +264,7 @@ pub fn open(
     env: &zlmdb::Env,
     name: &str,
     updates_notifier: crossbeam_channel::Sender<()>,
-) -> MResult<Option<Index>>
-{
+) -> MResult<Option<Index>> {
     // create all the store names
     let main_name = main_name(name);
     let postings_lists_name = postings_lists_name(name);
@@ -310,7 +313,9 @@ pub fn open(
         main: Main { main },
         postings_lists: PostingsLists { postings_lists },
         documents_fields: DocumentsFields { documents_fields },
-        documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
+        documents_fields_counts: DocumentsFieldsCounts {
+            documents_fields_counts,
+        },
         synonyms: Synonyms { synonyms },
         docs_words: DocsWords { docs_words },
         updates: Updates { updates },
diff --git a/meilidb-core/src/store/postings_lists.rs b/meilidb-core/src/store/postings_lists.rs
index eb917df5b..8835a504a 100644
--- a/meilidb-core/src/store/postings_lists.rs
+++ b/meilidb-core/src/store/postings_lists.rs
@@ -1,8 +1,8 @@
-use std::borrow::Cow;
+use crate::DocIndex;
 use sdset::{Set, SetBuf};
+use std::borrow::Cow;
 use zlmdb::types::{ByteSlice, CowSlice};
 use zlmdb::Result as ZResult;
-use crate::DocIndex;
 
 #[derive(Copy, Clone)]
 pub struct PostingsLists {
@@ -11,25 +11,23 @@ pub struct PostingsLists {
 impl PostingsLists {
     pub fn put_postings_list(
-        &self,
+        self,
         writer: &mut zlmdb::RwTxn,
         word: &[u8],
         words_indexes: &Set<DocIndex>,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         self.postings_lists.put(writer, word, words_indexes)
     }
 
-    pub fn del_postings_list(&self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
+    pub fn del_postings_list(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
         self.postings_lists.delete(writer, word)
     }
 
     pub fn postings_list<'txn>(
-        &self,
+        self,
         reader: &'txn zlmdb::RoTxn,
         word: &[u8],
-    ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>>
-    {
+    ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
         match self.postings_lists.get(reader, word)? {
             Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
             Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
diff --git a/meilidb-core/src/store/synonyms.rs b/meilidb-core/src/store/synonyms.rs
index ca032c223..b8002d464 100644
--- a/meilidb-core/src/store/synonyms.rs
+++ b/meilidb-core/src/store/synonyms.rs
@@ -9,28 +9,27 @@ pub struct Synonyms {
 
 impl Synonyms {
     pub fn put_synonyms(
-        &self,
+        self,
         writer: &mut zlmdb::RwTxn,
         word: &[u8],
         synonyms: &fst::Set,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         let bytes = synonyms.as_fst().as_bytes();
         self.synonyms.put(writer, word, bytes)
     }
 
-    pub fn del_synonyms(&self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
+    pub fn del_synonyms(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
         self.synonyms.delete(writer, word)
     }
 
-    pub fn synonyms(&self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
+    pub fn synonyms(self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
         match self.synonyms.get(reader, word)? {
             Some(bytes) => {
                 let len = bytes.len();
                 let bytes = Arc::from(bytes);
                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
                 Ok(Some(fst::Set::from(fst)))
-            },
+            }
             None => Ok(None),
         }
     }
diff --git a/meilidb-core/src/store/updates.rs b/meilidb-core/src/store/updates.rs
index 7ec3b48aa..8afe95e0d 100644
--- a/meilidb-core/src/store/updates.rs
+++ b/meilidb-core/src/store/updates.rs
@@ -1,13 +1,16 @@
+use super::BEU64;
+use crate::update::Update;
+use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use zlmdb::types::OwnedType;
-use zlmdb::{Result as ZResult, BytesEncode, BytesDecode};
-use serde::{Serialize, Deserialize};
-use crate::update::Update;
-use super::BEU64;
+use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
 
 pub struct SerdeJson<T>(std::marker::PhantomData<T>);
 
-impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
+impl<T> BytesEncode for SerdeJson<T>
+where
+    T: Serialize,
+{
     type EItem = T;
 
     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
@@ -15,7 +18,10 @@ impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
     }
 }
 
-impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T> where T: Deserialize<'a> + Clone {
+impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
+where
+    T: Deserialize<'a> + Clone,
+{
     type DItem = T;
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
@@ -30,7 +36,7 @@ pub struct Updates {
 
 impl Updates {
     // TODO do not trigger deserialize if possible
-    pub fn last_update_id(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
+    pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
         match self.updates.last(reader)? {
             Some((key, data)) => Ok(Some((key.get(), data))),
             None => Ok(None),
@@ -38,7 +44,7 @@ impl Updates {
     }
 
     // TODO do not trigger deserialize if possible
-    fn first_update_id(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
+    fn first_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
         match self.updates.first(reader)? {
             Some((key, data)) => Ok(Some((key.get(), data))),
             None => Ok(None),
@@ -46,31 +52,30 @@ impl Updates {
     }
 
     // TODO do not trigger deserialize if possible
-    pub fn contains(&self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> {
+    pub fn contains(self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> {
         let update_id = BEU64::new(update_id);
         self.updates.get(reader, &update_id).map(|v| v.is_some())
     }
 
     pub fn put_update(
-        &self,
+        self,
         writer: &mut zlmdb::RwTxn,
         update_id: u64,
         update: &Update,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         // TODO prefer using serde_json?
         let update_id = BEU64::new(update_id);
         self.updates.put(writer, &update_id, update)
     }
 
-    pub fn pop_front(&self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> {
+    pub fn pop_front(self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> {
         match self.first_update_id(writer)? {
             Some((update_id, update)) => {
                 let key = BEU64::new(update_id);
                 self.updates.delete(writer, &key)?;
                 Ok(Some((update_id, update)))
-            },
-            None => Ok(None)
+            }
+            None => Ok(None),
         }
     }
 }
diff --git a/meilidb-core/src/store/updates_results.rs b/meilidb-core/src/store/updates_results.rs
index 8deeb2f5b..cd3c96075 100644
--- a/meilidb-core/src/store/updates_results.rs
+++ b/meilidb-core/src/store/updates_results.rs
@@ -1,7 +1,7 @@
+use super::BEU64;
+use crate::update::UpdateResult;
 use zlmdb::types::{OwnedType, Serde};
 use zlmdb::Result as ZResult;
-use crate::update::UpdateResult;
-use super::BEU64;
 
 #[derive(Copy, Clone)]
 pub struct UpdatesResults {
@@ -9,7 +9,7 @@ pub struct UpdatesResults {
 }
 
 impl UpdatesResults {
-    pub fn last_update_id(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> {
+    pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> {
         match self.updates_results.last(reader)? {
             Some((key, data)) => Ok(Some((key.get(), data))),
             None => Ok(None),
@@ -17,22 +17,20 @@ impl UpdatesResults {
     }
 
     pub fn put_update_result(
-        &self,
+        self,
         writer: &mut zlmdb::RwTxn,
         update_id: u64,
         update_result: &UpdateResult,
-    ) -> ZResult<()>
-    {
+    ) -> ZResult<()> {
         let update_id = BEU64::new(update_id);
         self.updates_results.put(writer, &update_id, update_result)
     }
 
     pub fn update_result(
-        &self,
+        self,
         reader: &zlmdb::RoTxn,
         update_id: u64,
-    ) -> ZResult<Option<UpdateResult>>
-    {
+    ) -> ZResult<Option<UpdateResult>> {
         let update_id = BEU64::new(update_id);
         self.updates_results.get(reader, &update_id)
     }
diff --git a/meilidb-core/src/update/customs_update.rs b/meilidb-core/src/update/customs_update.rs
index 2303b8689..5072dc096 100644
--- a/meilidb-core/src/update/customs_update.rs
+++ b/meilidb-core/src/update/customs_update.rs
@@ -1,13 +1,12 @@
-use zlmdb::Result as ZResult;
-use crate::update::{Update, next_update_id};
 use crate::store;
+use crate::update::{next_update_id, Update};
+use zlmdb::Result as ZResult;
 
 pub fn apply_customs_update(
     writer: &mut zlmdb::RwTxn,
     main_store: store::Main,
     customs: &[u8],
-) -> ZResult<()>
-{
+) -> ZResult<()> {
     main_store.put_customs(writer, customs)
 }
 
@@ -16,8 +15,7 @@ pub fn push_customs_update(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     customs: Vec<u8>,
-) -> ZResult<u64>
-{
+) -> ZResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::Customs(customs);
diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs
index e57812f0d..22769fbe2 100644
--- a/meilidb-core/src/update/documents_addition.rs
+++ b/meilidb-core/src/update/documents_addition.rs
@@ -1,14 +1,14 @@
 use std::collections::{HashMap, HashSet};
 
-use fst::{SetBuilder, set::OpBuilder};
-use sdset::{SetOperation, duo::Union};
+use fst::{set::OpBuilder, SetBuilder};
+use sdset::{duo::Union, SetOperation};
 use serde::Serialize;
 
 use crate::raw_indexer::RawIndexer;
-use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
+use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
 use crate::store;
-use crate::update::{Update, next_update_id, apply_documents_deletion};
-use crate::{MResult, Error, RankedMap};
+use crate::update::{apply_documents_deletion, next_update_id, Update};
+use crate::{Error, MResult, RankedMap};
 
 pub struct DocumentsAddition<D> {
     updates_store: store::Updates,
@@ -22,8 +22,7 @@ impl<D> DocumentsAddition<D> {
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: crossbeam_channel::Sender<()>,
-    ) -> DocumentsAddition<D>
-    {
+    ) -> DocumentsAddition<D> {
         DocumentsAddition {
             updates_store,
             updates_results_store,
@@ -37,7 +36,8 @@ impl<D> DocumentsAddition<D> {
     }
 
     pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
-    where D: serde::Serialize
+    where
+        D: serde::Serialize,
     {
         let _ = self.updates_notifier.send(());
         let update_id = push_documents_addition(
@@ -51,7 +51,7 @@ impl<D> DocumentsAddition<D> {
 }
 
 impl<D> Extend<D> for DocumentsAddition<D> {
-    fn extend<T: IntoIterator<Item=D>>(&mut self, iter: T) {
+    fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
         self.documents.extend(iter)
     }
 }
@@ -61,8 +61,7 @@ pub fn push_documents_addition<D: serde::Serialize>(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     addition: Vec<D>,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let mut values = Vec::with_capacity(addition.len());
     for add in addition {
         let vec = serde_json::to_vec(&add)?;
@@ -87,8 +86,7 @@ pub fn apply_documents_addition(
     docs_words_store: store::DocsWords,
     mut ranked_map: RankedMap,
     addition: Vec<serde_json::Value>,
-) -> MResult<()>
-{
+) -> MResult<()> {
     let mut document_ids = HashSet::new();
     let mut document_store = RamDocumentStore::new();
     let mut document_fields_counts = HashMap::new();
@@ -182,7 +180,7 @@ pub fn apply_documents_addition(
             .into_inner()
             .and_then(fst::Set::from_bytes)
             .unwrap()
-        },
+        }
         None => delta_words,
     };
diff --git a/meilidb-core/src/update/documents_deletion.rs b/meilidb-core/src/update/documents_deletion.rs
index 72c03f741..e640cb508 100644
--- a/meilidb-core/src/update/documents_deletion.rs
+++ b/meilidb-core/src/update/documents_deletion.rs
@@ -1,13 +1,13 @@
-use std::collections::{HashMap, HashSet, BTreeSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 
 use fst::{SetBuilder, Streamer};
 use meilidb_schema::Schema;
-use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
+use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
 
-use crate::{DocumentId, RankedMap, MResult, Error};
 use crate::serde::extract_document_id;
-use crate::update::{Update, next_update_id};
 use crate::store;
+use crate::update::{next_update_id, Update};
+use crate::{DocumentId, Error, MResult, RankedMap};
 
 pub struct DocumentsDeletion {
     updates_store: store::Updates,
@@ -21,8 +21,7 @@ impl DocumentsDeletion {
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: crossbeam_channel::Sender<()>,
-    ) -> DocumentsDeletion
-    {
+    ) -> DocumentsDeletion {
         DocumentsDeletion {
             updates_store,
             updates_results_store,
@@ -36,7 +35,8 @@ impl DocumentsDeletion {
     }
 
     pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
-    where D: serde::Serialize,
+    where
+        D: serde::Serialize,
     {
         let identifier = schema.identifier_name();
         let document_id = match extract_document_id(identifier, &document)? {
@@ -62,7 +62,7 @@ impl DocumentsDeletion {
 }
 
 impl Extend<DocumentId> for DocumentsDeletion {
-    fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
+    fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
         self.documents.extend(iter)
     }
 }
@@ -72,8 +72,7 @@ pub fn push_documents_deletion(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     deletion: Vec<DocumentId>,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::DocumentsDeletion(deletion);
@@ -91,8 +90,7 @@ pub fn apply_documents_deletion(
     docs_words_store: store::DocsWords,
     mut ranked_map: RankedMap,
     deletion: Vec<DocumentId>,
-) -> MResult<()>
-{
+) -> MResult<()> {
     let idset = SetBuf::from_dirty(deletion);
 
     let schema = match main_store.schema(writer)? {
@@ -101,10 +99,17 @@ pub fn apply_documents_deletion(
     };
 
     // collect the ranked attributes according to the schema
-    let ranked_attrs: Vec<_> = schema.iter()
-        .filter_map(|(_, attr, prop)| {
-            if prop.is_ranked() { Some(attr) } else { None }
-        })
+    let ranked_attrs: Vec<_> = schema
+        .iter()
+        .filter_map(
+            |(_, attr, prop)| {
+                if prop.is_ranked() {
+                    Some(attr)
+                } else {
+                    None
+                }
+            },
+        )
         .collect();
 
     let mut words_document_ids = HashMap::new();
@@ -118,7 +123,10 @@ pub fn apply_documents_deletion(
         let mut stream = words.stream();
         while let Some(word) = stream.next() {
             let word = word.to_vec();
-            words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
+            words_document_ids
+                .entry(word)
+                .or_insert_with(Vec::new)
+                .push(id);
         }
     }
 }
@@ -167,7 +175,7 @@ pub fn apply_documents_deletion(
             .into_inner()
             .and_then(fst::Set::from_bytes)
             .unwrap()
-        },
+        }
         None => fst::Set::default(),
     };
diff --git a/meilidb-core/src/update/mod.rs b/meilidb-core/src/update/mod.rs
index bf3ef831f..ea93413f1 100644
--- a/meilidb-core/src/update/mod.rs
+++ b/meilidb-core/src/update/mod.rs
@@ -6,21 +6,21 @@ mod synonyms_addition;
 mod synonyms_deletion;
 
 pub use self::customs_update::{apply_customs_update, push_customs_update};
-pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
-pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
+pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
+pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
 pub use self::schema_update::{apply_schema_update, push_schema_update};
-pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
-pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};
+pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
+pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
 
-use std::time::{Duration, Instant};
-use std::collections::BTreeMap;
 use std::cmp;
+use std::collections::BTreeMap;
+use std::time::{Duration, Instant};
 
 use log::debug;
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};
 use zlmdb::Result as ZResult;
 
-use crate::{store, MResult, DocumentId, RankedMap};
+use crate::{store, DocumentId, MResult, RankedMap};
 use meilidb_schema::Schema;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -68,8 +68,7 @@ pub fn update_status(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     update_id: u64,
-) -> MResult<UpdateStatus>
-{
+) -> MResult<UpdateStatus> {
     match updates_results_store.update_result(reader, update_id)? {
         Some(result) => Ok(UpdateStatus::Processed(result)),
         None => {
@@ -86,8 +85,7 @@ pub fn next_update_id(
     writer: &mut zlmdb::RwTxn,
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
-) -> ZResult<u64>
-{
+) -> ZResult<u64> {
     let last_update_id = updates_store.last_update_id(writer)?;
     let last_update_id = last_update_id.map(|(n, _)| n);
 
@@ -100,7 +98,10 @@ pub fn next_update_id(
     Ok(new_update_id)
 }
 
-pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
+pub fn update_task(
+    writer: &mut zlmdb::RwTxn,
+    index: store::Index,
+) -> MResult<Option<UpdateResult>> {
     let (update_id, update) = match index.updates.pop_front(writer)? {
         Some(value) => value,
         None => return Ok(None),
@@ -112,11 +113,13 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
             let start = Instant::now();
 
-            let update_type = UpdateType::Schema { schema: schema.clone() };
+            let update_type = UpdateType::Schema {
+                schema: schema.clone(),
+            };
 
             let result = apply_schema_update(writer, index.main, &schema);
 
             (update_type, result, start.elapsed())
-        },
+        }
         Update::Customs(customs) => {
             let start = Instant::now();
 
@@ -133,7 +136,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
                 None => RankedMap::default(),
             };
 
-            let update_type = UpdateType::DocumentsAddition { number: documents.len() };
+            let update_type = UpdateType::DocumentsAddition {
+                number: documents.len(),
+            };
 
             let result = apply_documents_addition(
                 writer,
@@ -147,7 +152,7 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
             );
 
             (update_type, result, start.elapsed())
-        },
+        }
         Update::DocumentsDeletion(documents) => {
             let start = Instant::now();
 
@@ -156,7 +161,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
                 None => RankedMap::default(),
            };
 
-            let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
+            let update_type = UpdateType::DocumentsDeletion {
+                number: documents.len(),
+            };
 
             let result = apply_documents_deletion(
                 writer,
@@ -170,38 +177,35 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
             );
 
             (update_type, result, start.elapsed())
-        },
+        }
         Update::SynonymsAddition(synonyms) => {
             let start = Instant::now();
 
-            let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
+            let update_type = UpdateType::SynonymsAddition {
+                number: synonyms.len(),
+            };
 
-            let result = apply_synonyms_addition(
-                writer,
-                index.main,
-                index.synonyms,
-                synonyms,
-            );
+            let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
 
             (update_type, result, start.elapsed())
-        },
+        }
         Update::SynonymsDeletion(synonyms) => {
             let start = Instant::now();
 
-            let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
+            let update_type = UpdateType::SynonymsDeletion {
+                number: synonyms.len(),
+            };
 
-            let result = apply_synonyms_deletion(
-                writer,
-                index.main,
-                index.synonyms,
-                synonyms,
-            );
+            let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
 
             (update_type, result, start.elapsed())
-        },
+        }
     };
 
-    debug!("Processed update number {} {:?} {:?}", update_id, update_type, result);
+    debug!(
+        "Processed update number {} {:?} {:?}",
+        update_id, update_type, result
+    );
 
     let detailed_duration = DetailedDuration { main: duration };
     let status = UpdateResult {
@@ -211,7 +215,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
diff --git a/meilidb-core/src/update/schema_update.rs b/meilidb-core/src/update/schema_update.rs
--- a/meilidb-core/src/update/schema_update.rs
+++ b/meilidb-core/src/update/schema_update.rs
 pub fn apply_schema_update(
     writer: &mut zlmdb::RwTxn,
     main_store: store::Main,
     new_schema: &Schema,
-) -> MResult<()>
-{
-    if let Some(_) = main_store.schema(writer)? {
-        return Err(UnsupportedOperation::SchemaAlreadyExists.into())
+) -> MResult<()> {
+    if main_store.schema(writer)?.is_some() {
+        return Err(UnsupportedOperation::SchemaAlreadyExists.into());
     }
 
-    main_store.put_schema(writer, new_schema).map_err(Into::into)
+    main_store
+        .put_schema(writer, new_schema)
+        .map_err(Into::into)
 }
 
 pub fn push_schema_update(
@@ -20,8 +21,7 @@
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     schema: Schema,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::Schema(schema);
diff --git a/meilidb-core/src/update/synonyms_addition.rs b/meilidb-core/src/update/synonyms_addition.rs
index d6219eac6..f32475a1e 100644
--- a/meilidb-core/src/update/synonyms_addition.rs
+++ b/meilidb-core/src/update/synonyms_addition.rs
@@ -1,10 +1,10 @@
 use std::collections::BTreeMap;
 
-use fst::{SetBuilder, set::OpBuilder};
+use fst::{set::OpBuilder, SetBuilder};
 use sdset::SetBuf;
 
 use crate::automaton::normalize_str;
-use crate::update::{Update, next_update_id};
+use crate::update::{next_update_id, Update};
 use crate::{store, MResult};
 
 pub struct SynonymsAddition {
@@ -19,8 +19,7 @@ impl SynonymsAddition {
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: crossbeam_channel::Sender<()>,
-    ) -> SynonymsAddition
-    {
+    ) -> SynonymsAddition {
         SynonymsAddition {
             updates_store,
             updates_results_store,
@@ -30,13 +29,17 @@ impl SynonymsAddition {
     }
 
     pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
-    where S: AsRef<str>,
-          T: AsRef<str>,
-          I: IntoIterator<Item = T>,
+    where
+        S: AsRef<str>,
+        T: AsRef<str>,
+        I: IntoIterator<Item = T>,
     {
         let synonym = normalize_str(synonym.as_ref());
         let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
-        self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
+        self.synonyms
+            .entry(synonym)
+            .or_insert_with(Vec::new)
+            .extend(alternatives);
     }
 
     pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
@@ -56,8 +59,7 @@ pub fn push_synonyms_addition(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     addition: BTreeMap<String, Vec<String>>,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::SynonymsAddition(addition);
@@ -71,8 +73,7 @@ pub fn apply_synonyms_addition(
     main_store: store::Main,
     synonyms_store: store::Synonyms,
     addition: BTreeMap<String, Vec<String>>,
-) -> MResult<()>
-{
+) -> MResult<()> {
     let mut synonyms_builder = SetBuilder::memory();
 
     for (word, alternatives) in addition {
@@ -107,7 +108,7 @@ pub fn apply_synonyms_addition(
             .into_inner()
             .and_then(fst::Set::from_bytes)
             .unwrap()
-        },
+        }
         None => delta_synonyms,
     };
diff --git a/meilidb-core/src/update/synonyms_deletion.rs b/meilidb-core/src/update/synonyms_deletion.rs
index 43fc848ee..c498c3ab0 100644
--- a/meilidb-core/src/update/synonyms_deletion.rs
+++ b/meilidb-core/src/update/synonyms_deletion.rs
@@ -1,11 +1,11 @@
 use std::collections::BTreeMap;
 use std::iter::FromIterator;
 
-use fst::{SetBuilder, set::OpBuilder};
+use fst::{set::OpBuilder, SetBuilder};
 use sdset::SetBuf;
 
 use crate::automaton::normalize_str;
-use crate::update::{Update, next_update_id};
+use crate::update::{next_update_id, Update};
 use crate::{store, MResult};
 
 pub struct SynonymsDeletion {
@@ -20,8 +20,7 @@ impl SynonymsDeletion {
         updates_store: store::Updates,
         updates_results_store: store::UpdatesResults,
         updates_notifier: crossbeam_channel::Sender<()>,
-    ) -> SynonymsDeletion
-    {
+    ) -> SynonymsDeletion {
         SynonymsDeletion {
             updates_store,
             updates_results_store,
@@ -36,9 +35,10 @@ impl SynonymsDeletion {
     }
 
     pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
-    where S: AsRef<str>,
-          T: AsRef<str>,
-          I: Iterator<Item = T>,
+    where
+        S: AsRef<str>,
+        T: AsRef<str>,
+        I: Iterator<Item = T>,
     {
         let synonym = normalize_str(synonym.as_ref());
         let value = self.synonyms.entry(synonym).or_insert(None);
@@ -66,8 +66,7 @@ pub fn push_synonyms_deletion(
     updates_store: store::Updates,
     updates_results_store: store::UpdatesResults,
     deletion: BTreeMap<String, Option<Vec<String>>>,
-) -> MResult<u64>
-{
+) -> MResult<u64> {
     let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
 
     let update = Update::SynonymsDeletion(deletion);
@@ -81,8 +80,7 @@ pub fn apply_synonyms_deletion(
     main_store: store::Main,
     synonyms_store: store::Synonyms,
     deletion: BTreeMap<String, Option<Vec<String>>>,
-) -> MResult<()>
-{
+) -> MResult<()> {
     let mut delete_whole_synonym_builder = SetBuilder::memory();
 
     for (synonym, alternatives) in deletion {
@@ -98,9 +96,7 @@ pub fn apply_synonyms_deletion(
                 let alternatives = SetBuf::from_dirty(alternatives);
                 let mut builder = SetBuilder::memory();
                 builder.extend_iter(alternatives).unwrap();
-                builder.into_inner()
-                    .and_then(fst::Set::from_bytes)
-                    .unwrap()
+                builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
             };
 
             let op = OpBuilder::new()
@@ -124,7 +120,7 @@ pub fn apply_synonyms_deletion(
                 } else {
                     synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
                 }
-            },
+            }
             None => {
                 delete_whole_synonym_builder.insert(&synonym).unwrap();
                 synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
@@ -150,7 +146,7 @@ pub fn apply_synonyms_deletion(
             .into_inner()
             .and_then(fst::Set::from_bytes)
             .unwrap()
-        },
+        }
         None => fst::Set::default(),
     };
diff --git a/meilidb-schema/src/lib.rs b/meilidb-schema/src/lib.rs
index 5109b33e1..0a32a8f2d 100644
--- a/meilidb-schema/src/lib.rs
+++ b/meilidb-schema/src/lib.rs
@@ -1,14 +1,26 @@
-use std::collections::{HashMap, BTreeMap};
-use std::{fmt, u16};
+use std::collections::{BTreeMap, HashMap};
 use std::ops::BitOr;
 use std::sync::Arc;
+use std::{fmt, u16};
 
-use serde::{Serialize, Deserialize};
 use indexmap::IndexMap;
+use serde::{Deserialize, Serialize};
 
-pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
-pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
-pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
+pub const DISPLAYED: SchemaProps = SchemaProps {
+    displayed: true,
+    indexed: false,
+    ranked: false,
+};
+pub const INDEXED: SchemaProps = SchemaProps {
+    displayed: false,
+    indexed: true,
+    ranked: false,
+};
+pub const RANKED: SchemaProps = SchemaProps {
+    displayed: false,
+    indexed: false,
+    ranked: true,
+};
 
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct SchemaProps {
@@ -80,7 +92,13 @@ impl SchemaBuilder {
         }
 
         let identifier = self.identifier;
-        Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
+        Schema {
+            inner: Arc::new(InnerSchema {
+                identifier,
+                attrs,
+                props,
+            }),
+        }
     }
 }
 
@@ -100,7 +118,10 @@ impl Schema {
     fn to_builder(&self) -> SchemaBuilder {
         let identifier = self.inner.identifier.clone();
         let attributes = self.attributes_ordered();
-        SchemaBuilder { identifier, attributes }
+        SchemaBuilder {
+            identifier,
+            attributes,
+        }
     }
 
     fn attributes_ordered(&self) -> IndexMap<String, SchemaAttr> {
@@ -136,18 +157,18 @@ impl Schema {
         name
     }
 
-    pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
-        self.inner.props.iter()
-            .map(move |(name, prop)| {
-                let attr = self.inner.attrs.get(name).unwrap();
-                (name.as_str(), *attr, *prop)
-            })
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
+        self.inner.props.iter().map(move |(name, prop)| {
+            let attr = self.inner.attrs.get(name).unwrap();
+            (name.as_str(), *attr, *prop)
+        })
     }
 }
 
 impl Serialize for Schema {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where S: serde::ser::Serializer,
+    where
+        S: serde::ser::Serializer,
     {
         self.to_builder().serialize(serializer)
     }
@@ -155,15 +176,15 @@ impl Serialize for Schema {
 
 impl<'de> Deserialize<'de> for Schema {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where D: serde::de::Deserializer<'de>,
+    where
+        D: serde::de::Deserializer<'de>,
     {
         let builder = SchemaBuilder::deserialize(deserializer)?;
         Ok(builder.build())
     }
 }
 
-#[derive(Serialize, Deserialize)]
-#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
+#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
 pub struct SchemaAttr(pub u16);
 
 impl SchemaAttr {
diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
index 3cea72ffc..106d0f91f 100644
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -1,17 +1,17 @@
-use std::iter::Peekable;
-use slice_group_by::StrGroupBy;
 use self::SeparatorCategory::*;
+use slice_group_by::StrGroupBy;
+use std::iter::Peekable;
 
 pub fn is_cjk(c: char) -> bool {
-    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
-    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
-    (c >= '\u{3040}' && c <= '\u{309f}') ||
-    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
-    (c >= '\u{3100}' && c <= '\u{312f}') ||
-    (c >= '\u{3200}' && c <= '\u{32ff}') ||
-    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
-    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
-    (c >= '\u{f900}' && c <= '\u{faff}')
+    (c >= '\u{2e80}' && c <= '\u{2eff}')
+        || (c >= '\u{2f00}' && c <= '\u{2fdf}')
+        || (c >= '\u{3040}' && c <= '\u{309f}')
+        || (c >= '\u{30a0}' && c <= '\u{30ff}')
+        || (c >= '\u{3100}' && c <= '\u{312f}')
+        || (c >= '\u{3200}' && c <= '\u{32ff}')
+        || (c >= '\u{3400}' && c <= '\u{4dbf}')
+        || (c >= '\u{4e00}' && c <= '\u{9fff}')
+        || (c >= '\u{f900}' && c <= '\u{faff}')
 }
 
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -22,7 +22,11 @@ enum SeparatorCategory {
 
 impl SeparatorCategory {
     fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
-        if let (Soft, Soft) = (self, other) { Soft } else { Hard }
+        if let (Soft, Soft) = (self, other) {
+            Soft
+        } else {
+            Hard
+        }
     }
 
     fn to_usize(self) -> usize {
@@ -40,7 +44,7 @@ fn is_separator(c: char) -> bool {
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
     match c {
         ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
-        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
+        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
         _ => None,
     }
 }
@@ -79,7 +83,7 @@ fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, u
     (n + 1, i + c.len_utf8())
 }
 
-pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
+pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
     Tokenizer::new(query).map(|t| t.word)
 }
 
@@ -100,9 +104,10 @@ impl<'a> Tokenizer<'a> {
     pub fn new(string: &str) -> Tokenizer {
         // skip every separator and set `char_index`
         // to the number of char trimmed
-        let (count, index) = string.char_indices()
-            .take_while(|(_, c)| is_separator(*c))
-            .fold((0, 0), chars_count_index);
+        let (count, index) = string
+            .char_indices()
+            .take_while(|(_, c)| is_separator(*c))
+            .fold((0, 0), chars_count_index);
 
         Tokenizer {
             inner: &string[index..],
@@ -122,10 +127,11 @@ impl<'a> Iterator for Tokenizer<'a> {
             let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
 
             if !is_str_word(string) {
-                self.word_index += string.chars()
-                    .filter_map(classify_separator)
-                    .fold(Soft, |a, x| a.merge(x))
-                    .to_usize();
+                self.word_index += string
+                    .chars()
+                    .filter_map(classify_separator)
+                    .fold(Soft, |a, x| a.merge(x))
+                    .to_usize();
                 self.char_index += count;
                 self.inner = &self.inner[index..];
                 continue;
@@ -153,7 +159,8 @@ impl<'a> Iterator for Tokenizer<'a> {
 }
 
 pub struct SeqTokenizer<'a, I>
-where I: Iterator<Item = &'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
     inner: I,
     current: Option<Peekable<Tokenizer<'a>>>,
@@ -162,13 +169,14 @@ where I: Iterator<Item = &'a str>,
 }
 
 impl<'a, I> SeqTokenizer<'a, I>
-where I: Iterator<Item = &'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
     pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
         let current = iter.next().map(|s| Tokenizer::new(s).peekable());
         SeqTokenizer {
             inner: iter,
-            current: current,
+            current,
             word_offset: 0,
             char_offset: 0,
         }
@@ -176,7 +184,8 @@ where I: Iterator<Item = &'a str>,
 }
 
 impl<'a, I> Iterator for SeqTokenizer<'a, I>
-where I: Iterator<Item = &'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
     type Item = Token<'a>;
 
@@ -202,15 +211,15 @@ where I: Iterator<Item = &'a str>,
                         }
 
                         Some(token)
-                    },
+                    }
                     None => {
                         // no more words in this text we must
                         // start tokenizing the next text
                         self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
                         self.next()
-                    },
+                    }
                 }
-            },
+            }
             // no more texts available
             None => None,
         }
@@ -225,12 +234,26 @@ mod tests {
     fn easy() {
         let mut tokenizer = Tokenizer::new("salut");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "salut",
+                word_index: 0,
+                char_index: 0
+            })
+        );
         assert_eq!(tokenizer.next(), None);
 
         let mut tokenizer = Tokenizer::new("yo ");
 
-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
         assert_eq!(tokenizer.next(), None);
     }
 
@@ -238,19 +261,82 @@ mod tests {
     fn hard() {
         let mut tokenizer = Tokenizer::new(" .? yo lolo.
aïe (ouch)"); - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 })); - assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); - assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 })); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "yo", + word_index: 0, + char_index: 4 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lolo", + word_index: 1, + char_index: 7 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "aïe", + word_index: 9, + char_index: 13 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "ouch", + word_index: 17, + char_index: 18 + }) + ); assert_eq!(tokenizer.next(), None); let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); - assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 })); - assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 })); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "yo", + word_index: 0, + char_index: 0 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lolo", + word_index: 8, + char_index: 5 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "wtf", + word_index: 16, + char_index: 12 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lol", + word_index: 17, + char_index: 18 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "aïe", + word_index: 25, + char_index: 24 + }) + ); assert_eq!(tokenizer.next(), None); } @@ -258,18 +344,74 @@ mod tests { fn hard_long_chars() { let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe"); - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); - assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 })); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "yo", + word_index: 0, + char_index: 4 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "😂", + word_index: 1, + char_index: 7 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "aïe", + word_index: 9, + char_index: 10 + }) + ); assert_eq!(tokenizer.next(), None); let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 
😣 ,"); - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 })); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "yo", + word_index: 0, + char_index: 0 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lolo", + word_index: 8, + char_index: 5 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "😱", + word_index: 16, + char_index: 12 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lol", + word_index: 17, + char_index: 16 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "😣", + word_index: 25, + char_index: 22 + }) + ); assert_eq!(tokenizer.next(), None); } @@ -277,19 +419,82 @@ mod tests { fn hard_kanjis() { let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}"); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 })); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "\u{2ec4}", + word_index: 0, + char_index: 0 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lolilol", + word_index: 1, + char_index: 1 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "\u{2ec7}", + word_index: 2, + char_index: 8 + }) + ); assert_eq!(tokenizer.next(), None); let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}"); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 })); - assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 })); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "\u{2ec4}", + word_index: 0, + char_index: 0 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "\u{2ed3}", + word_index: 1, + char_index: 1 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "\u{2ef2}", + word_index: 2, + char_index: 2 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "lolilol", + word_index: 3, + char_index: 4 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "hello", + word_index: 4, + char_index: 14 + }) + ); + assert_eq!( + tokenizer.next(), + Some(Token { + word: "\u{2ec7}", + word_index: 5, + char_index: 23 + }) + ); assert_eq!(tokenizer.next(), None); } }