Merge pull request #229 from meilisearch/cargo-fmt-clippy

Cargo pass of fmt and clippy
Clément Renault 2019-10-18 13:50:30 +02:00 committed by GitHub
commit 6c9a238973
49 changed files with 1683 additions and 1042 deletions
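
Nearly every hunk in this commit is mechanical output of `cargo fmt`, plus a few clippy fixes. As a reader's aid, here is a small, self-contained Rust sketch, with invented names, of the conventions that recur below: alphabetically sorted imports, block-style `match` arms without trailing commas after a closing brace, explicit `break;`/`return x;`, and tight inclusive ranges (`0..=4`):

```rust
// Illustrative only; none of these items come from the MeiliDB sources.
use std::cmp::Ordering; // imports are kept sorted alphabetically within a group

fn classify(len: usize) -> &'static str {
    match len {
        0..=4 => "short",  // written `0..=4`, not `0 ..= 4`
        5..=8 => "medium", // brace-less arms keep their trailing comma
        _ => "long",
    }
}

fn main() {
    assert_eq!(classify(3), "short");
    assert_eq!(3_u32.cmp(&4), Ordering::Less);
    println!("{}", classify(9));
}
```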

View File

@@ -13,13 +13,17 @@ jobs:
   steps:
   - script: |
       curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
-    displayName: 'Install rustc'
+      $HOME/.cargo/bin/rustup component add rustfmt
+    displayName: 'Install rustc and components'
   - script: |
       $HOME/.cargo/bin/cargo check
     displayName: 'Check MeiliDB'
   - script: |
       $HOME/.cargo/bin/cargo test
     displayName: 'Test MeiliDB'
+  - script: |
+      $HOME/.cargo/bin/cargo fmt --all -- --check
+    displayName: 'Fmt MeiliDB'
 
 - job: build
   dependsOn:
@@ -31,7 +35,8 @@ jobs:
   steps:
   - script: |
       curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
-    displayName: 'Install rustc'
+      $HOME/.cargo/bin/rustup component add rustfmt
+    displayName: 'Install rustc and components'
   - script: |
       $HOME/.cargo/bin/cargo build --release
     displayName: 'Build MeiliDB'

View File

@@ -4,15 +4,15 @@ use std::error::Error;
 use std::io::Write;
 use std::iter::FromIterator;
 use std::path::{Path, PathBuf};
-use std::time::{Instant, Duration};
+use std::time::{Duration, Instant};
 use std::{fs, io, sync::mpsc};
 
-use rustyline::{Editor, Config};
-use serde::{Serialize, Deserialize};
+use rustyline::{Config, Editor};
+use serde::{Deserialize, Serialize};
 use structopt::StructOpt;
 use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 
-use meilidb_core::{Highlight, Database, UpdateResult};
+use meilidb_core::{Database, Highlight, UpdateResult};
 use meilidb_schema::SchemaAttr;
 
 const INDEX_NAME: &str = "default";
@@ -91,7 +91,7 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
     let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
     let index = match database.open_index(INDEX_NAME) {
         Some(index) => index,
-        None => database.create_index(INDEX_NAME).unwrap()
+        None => database.create_index(INDEX_NAME).unwrap(),
     };
 
     let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
@@ -108,14 +108,14 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
     match index.main.schema(&writer)? {
         Some(current_schema) => {
             if current_schema != schema {
-                return Err(meilidb_core::Error::SchemaDiffer.into())
+                return Err(meilidb_core::Error::SchemaDiffer.into());
             }
             writer.abort();
-        },
+        }
         None => {
             index.schema_update(&mut writer, schema)?;
             writer.commit().unwrap();
-        },
+        }
     }
 
     let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
@@ -131,7 +131,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
     loop {
         end_of_file = !rdr.read_record(&mut raw_record)?;
-        if end_of_file { break }
+        if end_of_file {
+            break;
+        }
 
         let document: Document = match raw_record.deserialize(Some(&headers)) {
             Ok(document) => document,
@@ -147,7 +149,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
         i += 1;
 
         if let Some(group_size) = command.update_group_size {
-            if i % group_size == 0 { break }
+            if i % group_size == 0 {
+                break;
+            }
         }
     }
@@ -163,15 +167,25 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
     println!("Waiting for update {}", max_update_id);
     for id in receiver {
-        if id == max_update_id { break }
+        if id == max_update_id {
+            break;
+        }
     }
 
-    println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path);
+    println!(
+        "database created in {:.2?} at: {:?}",
+        start.elapsed(),
+        command.database_path
+    );
 
     if let Some(path) = command.compact_to_path {
         let start = Instant::now();
         let _file = database.copy_and_compact_to_path(&path)?;
-        println!("database compacted in {:.2?} at: {:?}", start.elapsed(), path);
+        println!(
+            "database compacted in {:.2?} at: {:?}",
+            start.elapsed(),
+            path
+        );
     }
 
     Ok(())
@@ -182,7 +196,10 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
     let mut highlighted = false;
     for range in ranges.windows(2) {
-        let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
+        let [start, end] = match range {
+            [start, end] => [*start, *end],
+            _ => unreachable!(),
+        };
         if highlighted {
             stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
         }
@@ -221,12 +238,14 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
         let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
 
         match byte_indexes.entry(byte_index) {
-            Entry::Vacant(entry) => { entry.insert(byte_length); },
+            Entry::Vacant(entry) => {
+                entry.insert(byte_length);
+            }
             Entry::Occupied(mut entry) => {
                 if *entry.get() < byte_length {
                     entry.insert(byte_length);
                 }
-            },
+            }
         }
     }
@@ -252,22 +271,23 @@ fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
 /// ```
 fn crop_text(
     text: &str,
-    highlights: impl IntoIterator<Item=Highlight>,
+    highlights: impl IntoIterator<Item = Highlight>,
     context: usize,
-) -> (String, Vec<Highlight>)
-{
+) -> (String, Vec<Highlight>) {
     let mut highlights = highlights.into_iter().peekable();
 
-    let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
+    let char_index = highlights
+        .peek()
+        .map(|m| m.char_index as usize)
+        .unwrap_or(0);
     let start = char_index.saturating_sub(context);
     let text = text.chars().skip(start).take(context * 2).collect();
 
     let highlights = highlights
-        .take_while(|m| {
-            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
-        })
-        .map(|highlight| {
-            Highlight { char_index: highlight.char_index - start as u16, ..highlight }
-        })
+        .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2))
+        .map(|highlight| Highlight {
+            char_index: highlight.char_index - start as u16,
+            ..highlight
+        })
         .collect();
@@ -276,7 +296,9 @@ fn crop_text(
 fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
     let env = &database.env;
-    let index = database.open_index(INDEX_NAME).expect("Could not find index");
+    let index = database
+        .open_index(INDEX_NAME)
+        .expect("Could not find index");
     let reader = env.read_txn().unwrap();
 
     let schema = index.main.schema(&reader)?;
@@ -312,10 +334,15 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
                 (true, filter)
             };
 
-            let attr = schema.attribute(&filter).expect("Could not find filtered attribute");
+            let attr = schema
+                .attribute(&filter)
+                .expect("Could not find filtered attribute");
 
             builder.with_filter(move |document_id| {
-                let string: String = ref_index.document_attribute(ref_reader, document_id, attr).unwrap().unwrap();
+                let string: String = ref_index
+                    .document_attribute(ref_reader, document_id, attr)
+                    .unwrap()
+                    .unwrap();
                 (string == "true") == positive
             });
         }
@@ -326,8 +353,8 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
             let number_of_documents = documents.len();
             for mut doc in documents {
-
-                doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+                doc.highlights
+                    .sort_unstable_by_key(|m| (m.char_index, m.char_length));
 
                 let start_retrieve = Instant::now();
                 let result = index.document::<Document>(&reader, Some(&fields), doc.id);
@@ -340,15 +367,18 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
                 print!("{}: ", name);
 
                 let attr = schema.attribute(&name).unwrap();
-                let highlights = doc.highlights.iter()
-                    .filter(|m| SchemaAttr::new(m.attribute) == attr)
-                    .cloned();
-                let (text, highlights) = crop_text(&text, highlights, command.char_context);
+                let highlights = doc
+                    .highlights
+                    .iter()
+                    .filter(|m| SchemaAttr::new(m.attribute) == attr)
+                    .cloned();
+                let (text, highlights) =
+                    crop_text(&text, highlights, command.char_context);
                 let areas = create_highlight_areas(&text, &highlights);
                 display_highlights(&text, &areas)?;
                 println!();
             }
-        },
+        }
         Ok(None) => eprintln!("missing document"),
         Err(e) => eprintln!("{}", e),
     }
@@ -366,12 +396,19 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
                 println!();
             }
 
-            eprintln!("whole documents fields retrieve took {:.2?}", retrieve_duration);
-            eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
-        },
+            eprintln!(
+                "whole documents fields retrieve took {:.2?}",
+                retrieve_duration
+            );
+            eprintln!(
+                "===== Found {} results in {:.2?} =====",
+                number_of_documents,
+                start_total.elapsed()
+            );
+        }
         Err(err) => {
             println!("Error: {:?}", err);
-            break
+            break;
         }
     }
 }
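
The `crop_text` hunks above only reshape the code, but the algorithm is easy to lose in the noise. A simplified sketch of the same idea, using a hypothetical `Span` type in place of meilidb's `Highlight` (which additionally carries an attribute and uses `u16` indices):

```rust
#[derive(Debug, Clone, Copy, PartialEq)]
struct Span {
    start: usize, // character index of the highlight
    len: usize,
}

// Keep `context * 2` characters starting `context` before the first span,
// drop the spans that no longer fit, and shift the survivors left.
fn crop(text: &str, spans: &[Span], context: usize) -> (String, Vec<Span>) {
    let first = spans.first().map(|s| s.start).unwrap_or(0);
    let start = first.saturating_sub(context);
    let cropped: String = text.chars().skip(start).take(context * 2).collect();
    let spans = spans
        .iter()
        .take_while(|s| s.start + s.len <= start + context * 2)
        .map(|s| Span {
            start: s.start - start,
            ..*s
        })
        .collect();
    (cropped, spans)
}

fn main() {
    let spans = [Span { start: 8, len: 4 }]; // "york"
    let (text, spans) = crop("the new york subway", &spans, 4);
    assert_eq!(text, "new york"); // context * 2 = 8 characters kept
    assert_eq!(spans, vec![Span { start: 4, len: 4 }]); // shifted left by 4
    println!("{:?} {:?}", text, spans);
}
```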

View File

@@ -1,8 +1,5 @@
+use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
 use once_cell::sync::OnceCell;
-use levenshtein_automata::{
-    LevenshteinAutomatonBuilder as LevBuilder,
-    DFA,
-};
 
 static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
 static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
@@ -15,30 +12,30 @@ enum PrefixSetting {
 }
 
 fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
-    use PrefixSetting::{Prefix, NoPrefix};
+    use PrefixSetting::{NoPrefix, Prefix};
 
     match query.len() {
-        0 ..= 4 => {
+        0..=4 => {
             let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
             match setting {
                 Prefix => builder.build_prefix_dfa(query),
                 NoPrefix => builder.build_dfa(query),
             }
-        },
-        5 ..= 8 => {
+        }
+        5..=8 => {
             let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
             match setting {
                 Prefix => builder.build_prefix_dfa(query),
                 NoPrefix => builder.build_dfa(query),
             }
-        },
+        }
         _ => {
             let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
             match setting {
                 Prefix => builder.build_prefix_dfa(query),
                 NoPrefix => builder.build_dfa(query),
             }
-        },
+        }
     }
 }
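
Beyond the import reordering, this file is a good example of the lazy one-time initialization pattern behind the `LEVDIST*` statics. A minimal sketch of the same pattern using the standard library's `OnceLock` (the std counterpart of `once_cell::sync::OnceCell`, stable since Rust 1.70; the cached value here is invented for the example):

```rust
use std::sync::OnceLock;

// Initialized at most once, on first access, and safely shared across threads.
static SQUARES: OnceLock<Vec<u64>> = OnceLock::new();

fn squares() -> &'static [u64] {
    SQUARES.get_or_init(|| (0..100).map(|n| n * n).collect())
}

fn main() {
    // The init closure runs only on the first call; later calls reuse the value.
    assert_eq!(squares()[7], 49);
    assert_eq!(squares().len(), 100);
    println!("ok");
}
```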

View File

@@ -6,14 +6,14 @@ use std::vec;
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::DFA;
-use meilidb_tokenizer::{split_query_string, is_cjk};
+use meilidb_tokenizer::{is_cjk, split_query_string};
 
-use crate::store;
 use crate::error::MResult;
+use crate::store;
 
 use self::dfa::{build_dfa, build_prefix_dfa};
-use self::query_enhancer::QueryEnhancerBuilder;
 pub use self::query_enhancer::QueryEnhancer;
+use self::query_enhancer::QueryEnhancerBuilder;
 
 const NGRAMS: usize = 3;
@@ -27,14 +27,9 @@ impl AutomatonProducer {
         query: &str,
         main_store: store::Main,
         synonyms_store: store::Synonyms,
-    ) -> MResult<(AutomatonProducer, QueryEnhancer)>
-    {
-        let (automatons, query_enhancer) = generate_automatons(
-            reader,
-            query,
-            main_store,
-            synonyms_store,
-        )?;
+    ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
+        let (automatons, query_enhancer) =
+            generate_automatons(reader, query, main_store, synonyms_store)?;
 
         Ok((AutomatonProducer { automatons }, query_enhancer))
     }
@@ -112,8 +107,7 @@ fn generate_automatons(
     query: &str,
     main_store: store::Main,
     synonym_store: store::Synonyms,
-) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)>
-{
+) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
     let synonyms = match main_store.synonyms_fst(reader)? {
@@ -130,7 +124,6 @@ fn generate_automatons(
     let mut original_automatons = Vec::new();
     let mut original_words = query_words.iter().peekable();
     while let Some(word) = original_words.next() {
-
         let has_following_word = original_words.peek().is_some();
         let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
@@ -148,29 +141,33 @@ fn generate_automatons(
     for n in 1..=NGRAMS {
-
         let mut ngrams = query_words.windows(n).enumerate().peekable();
         while let Some((query_index, ngram_slice)) = ngrams.next() {
-
             let query_range = query_index..query_index + n;
             let ngram_nb_words = ngram_slice.len();
             let ngram = ngram_slice.join(" ");
 
             let has_following_word = ngrams.peek().is_some();
-            let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
+            let not_prefix_dfa =
+                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
 
             // automaton of synonyms of the ngrams
             let normalized = normalize_str(&ngram);
-            let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };
+            let lev = if not_prefix_dfa {
+                build_dfa(&normalized)
+            } else {
+                build_prefix_dfa(&normalized)
+            };
 
             let mut stream = synonyms.search(&lev).into_stream();
             while let Some(base) = stream.next() {
-
                 // only trigger alternatives when the last word has been typed
                 // i.e. "new " do not but "new yo" triggers alternatives to "new york"
                 let base = std::str::from_utf8(base).unwrap();
                 let base_nb_words = split_query_string(base).count();
-                if ngram_nb_words != base_nb_words { continue }
+                if ngram_nb_words != base_nb_words {
+                    continue;
+                }
 
                 if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
-
                     let mut stream = synonyms.into_stream();
                     while let Some(synonyms) = stream.next() {
                         let synonyms = std::str::from_utf8(synonyms).unwrap();
@@ -178,7 +175,11 @@ fn generate_automatons(
                         let nb_synonym_words = synonyms_words.len();
 
                         let real_query_index = automaton_index;
-                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
+                        enhancer_builder.declare(
+                            query_range.clone(),
+                            real_query_index,
+                            &synonyms_words,
+                        );
 
                         for synonym in synonyms_words {
                             let automaton = if nb_synonym_words == 1 {

View File

@@ -1,5 +1,5 @@
+use std::cmp::Ordering::{Equal, Greater, Less};
 use std::ops::Range;
-use std::cmp::Ordering::{Less, Greater, Equal};
 
 /// Return `true` if the specified range can accept the given replacements words.
 /// Returns `false` if the replacements words are already present in the original query
@@ -34,13 +34,14 @@ use std::cmp::Ordering::{Less, Greater, Equal};
 // [new york city]
 //
 fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
-where S: AsRef<str>,
-      T: AsRef<str>,
+where
+    S: AsRef<str>,
+    T: AsRef<str>,
 {
     if words.len() <= range.len() {
         // there is fewer or equal replacement words
         // than there is already in the replaced range
-        return false
+        return false;
     }
 
     // retrieve the part to rewrite but with the length
@@ -49,7 +50,9 @@ where S: AsRef<str>,
 
     // check if the original query doesn't already contain
     // the replacement words
-    !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
+    !original
+        .map(AsRef::as_ref)
+        .eq(words.iter().map(AsRef::as_ref))
 }
 
 type Origin = usize;
@@ -68,11 +71,20 @@ impl FakeIntervalTree {
     fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
         let element = self.intervals.binary_search_by(|(r, _)| {
             if point >= r.start {
-                if point < r.end { Equal } else { Less }
-            } else { Greater }
+                if point < r.end {
+                    Equal
+                } else {
+                    Less
+                }
+            } else {
+                Greater
+            }
         });
 
-        let n = match element { Ok(n) => n, Err(n) => n };
+        let n = match element {
+            Ok(n) => n,
+            Err(n) => n,
+        };
 
         match self.intervals.get(n) {
             Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
@@ -90,10 +102,14 @@ pub struct QueryEnhancerBuilder<'a, S> {
 impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
     pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
         // we initialize origins query indices based on their positions
-        let origins: Vec<_> = (0..query.len() + 1).collect();
-        let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
+        let origins: Vec<_> = (0..=query.len()).collect();
+        let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
 
-        QueryEnhancerBuilder { query, origins, real_to_origin }
+        QueryEnhancerBuilder {
+            query,
+            origins,
+            real_to_origin,
+        }
     }
@@ -101,12 +117,12 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
     /// `range` is the original words range that this `replacement` words replace
     /// and `real` is the first real query index of these replacement words.
     pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
-    where T: AsRef<str>,
+    where
+        T: AsRef<str>,
     {
         // check if the range of original words
         // can be rewritten with the replacement words
         if rewrite_range_with(self.query, range.clone(), replacement) {
-
             // this range can be replaced so we need to
             // modify the origins accordingly
             let offset = replacement.len() - range.len();
@@ -126,7 +142,8 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
         // we need to pad real query indices
         let real_range = real..real + replacement.len().max(range.len());
         let real_length = replacement.len();
-        self.real_to_origin.push((real_range, (range.start, real_length)));
+        self.real_to_origin
+            .push((real_range, (range.start, real_length)));
     }
 
     pub fn build(self) -> QueryEnhancer {
@@ -148,10 +165,10 @@ impl QueryEnhancer {
         let real = real as usize;
 
         // query the fake interval tree with the real query index
-        let (range, (origin, real_length)) =
-            self.real_to_origin
-                .query(real)
-                .expect("real has never been declared");
+        let (range, (origin, real_length)) = self
+            .real_to_origin
+            .query(real)
+            .expect("real has never been declared");
 
         // if `real` is the end bound of the range
         if (range.start + real_length - 1) == real {
@@ -160,7 +177,10 @@ impl QueryEnhancer {
             for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
                 let len = slice[1] - slice[0];
                 count = count.saturating_sub(len);
-                if count == 0 { new_origin = origin + i; break }
+                if count == 0 {
+                    new_origin = origin + i;
+                    break;
+                }
             }
 
             let n = real - range.start;
@@ -168,15 +188,20 @@ impl QueryEnhancer {
             let end = self.origins[new_origin + 1];
             let remaining = (end - start) - n;
 
-            Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
+            Range {
+                start: (start + n) as u32,
+                end: (start + n + remaining) as u32,
+            }
         } else {
             // just return the origin along with
             // the real position of the word
             let n = real as usize - range.start;
             let origin = self.origins[origin];
 
-            Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
+            Range {
+                start: (origin + n) as u32,
+                end: (origin + n + 1) as u32,
+            }
         }
     }
 }
@@ -382,16 +407,16 @@ mod tests {
         let enhancer = builder.build();
 
-        assert_eq!(enhancer.replacement(0),  0..1); // great
-        assert_eq!(enhancer.replacement(1),  1..2); // awesome
-        assert_eq!(enhancer.replacement(2),  2..5); // NYC
-        assert_eq!(enhancer.replacement(3),  5..7); // subway
-        assert_eq!(enhancer.replacement(4),  2..3); // new
-        assert_eq!(enhancer.replacement(5),  3..4); // york
-        assert_eq!(enhancer.replacement(6),  4..5); // city
-        assert_eq!(enhancer.replacement(7),  5..6); // underground
-        assert_eq!(enhancer.replacement(8),  6..7); // train
-        assert_eq!(enhancer.replacement(9),  0..2); // good
+        assert_eq!(enhancer.replacement(0), 0..1); // great
+        assert_eq!(enhancer.replacement(1), 1..2); // awesome
+        assert_eq!(enhancer.replacement(2), 2..5); // NYC
+        assert_eq!(enhancer.replacement(3), 5..7); // subway
+        assert_eq!(enhancer.replacement(4), 2..3); // new
+        assert_eq!(enhancer.replacement(5), 3..4); // york
+        assert_eq!(enhancer.replacement(6), 4..5); // city
+        assert_eq!(enhancer.replacement(7), 5..6); // underground
+        assert_eq!(enhancer.replacement(8), 6..7); // train
+        assert_eq!(enhancer.replacement(9), 0..2); // good
         assert_eq!(enhancer.replacement(10), 1..5); // NY
         assert_eq!(enhancer.replacement(11), 2..5); // metro
     }
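
One change in this file is more than formatting: `(0..query.len() + 1)` became `(0..=query.len())`, clippy's `range_plus_one` fix. The two ranges yield exactly the same indices, as this standalone check shows:

```rust
fn main() {
    let query = ["new", "york", "subway"];
    let half_open: Vec<usize> = (0..query.len() + 1).collect();
    let inclusive: Vec<usize> = (0..=query.len()).collect();
    assert_eq!(half_open, inclusive); // both are [0, 1, 2, 3]
    println!("{:?}", inclusive);
}
```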

View File

@@ -1,6 +1,6 @@
-use std::cmp::Ordering;
 use crate::criterion::Criterion;
 use crate::RawDocument;
+use std::cmp::Ordering;
 
 #[derive(Debug, Clone, Copy)]
 pub struct DocumentId;

View File

@@ -1,8 +1,8 @@
 use std::cmp::Ordering;
 
+use meilidb_schema::SchemaAttr;
 use sdset::Set;
 use slice_group_by::GroupBy;
-use meilidb_schema::SchemaAttr;
 
 use crate::criterion::Criterion;
 use crate::RawDocument;
@@ -13,8 +13,8 @@ fn number_exact_matches(
     attribute: &[u16],
     is_exact: &[bool],
     fields_counts: &Set<(SchemaAttr, u64)>,
-) -> usize
-{
+) -> usize {
     let mut count = 0;
     let mut index = 0;
@@ -22,12 +21,16 @@ fn number_exact_matches(
         let len = group.len();
 
         let mut found_exact = false;
-        for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() {
+        for (pos, _) in is_exact[index..index + len]
+            .iter()
+            .filter(|x| **x)
+            .enumerate()
+        {
             found_exact = true;
             if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
                 let (_, count) = fields_counts[pos];
                 if count == 1 {
-                    return usize::max_value()
+                    return usize::max_value();
                 }
             }
         }
@@ -81,18 +84,18 @@ mod tests {
     #[test]
     fn easy_case() {
         let doc0 = {
-            let query_index   = &[0];
-            let attribute     = &[0];
-            let is_exact      = &[true];
+            let query_index = &[0];
+            let attribute = &[0];
+            let is_exact = &[true];
             let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
 
         let doc1 = {
-            let query_index   = &[0];
-            let attribute     = &[0];
-            let is_exact      = &[false];
+            let query_index = &[0];
+            let attribute = &[0];
+            let is_exact = &[false];
             let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
@@ -108,18 +111,18 @@ mod tests {
     #[test]
     fn basic() {
         let doc0 = {
-            let query_index   = &[0];
-            let attribute     = &[0];
-            let is_exact      = &[true];
+            let query_index = &[0];
+            let attribute = &[0];
+            let is_exact = &[true];
             let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
 
         let doc1 = {
-            let query_index   = &[0];
-            let attribute     = &[0];
-            let is_exact      = &[true];
+            let query_index = &[0];
+            let attribute = &[0];
+            let is_exact = &[true];
            let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)

View File

@@ -1,24 +1,20 @@
-mod sum_of_typos;
+mod document_id;
+mod exact;
 mod number_of_words;
-mod words_proximity;
+mod sort_by_attr;
+mod sum_of_typos;
 mod sum_of_words_attribute;
 mod sum_of_words_position;
-mod exact;
-mod sort_by_attr;
-mod document_id;
+mod words_proximity;
 
-use std::cmp::Ordering;
 use crate::RawDocument;
+use std::cmp::Ordering;
 
 pub use self::{
-    sum_of_typos::SumOfTypos,
-    number_of_words::NumberOfWords,
-    words_proximity::WordsProximity,
-    sum_of_words_attribute::SumOfWordsAttribute,
-    sum_of_words_position::SumOfWordsPosition,
-    exact::Exact,
-    sort_by_attr::SortByAttr,
-    document_id::DocumentId,
+    document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
+    sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
+    sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
+    words_proximity::WordsProximity,
 };
 
 pub trait Criterion: Send + Sync {
@@ -62,17 +58,18 @@ impl<T: Criterion + ?Sized> Criterion for Box<T> {
 #[derive(Default)]
 pub struct CriteriaBuilder<'a> {
-    inner: Vec<Box<dyn Criterion + 'a>>
+    inner: Vec<Box<dyn Criterion + 'a>>,
 }
 
-impl<'a> CriteriaBuilder<'a>
-{
+impl<'a> CriteriaBuilder<'a> {
     pub fn new() -> CriteriaBuilder<'a> {
         CriteriaBuilder { inner: Vec::new() }
     }
 
     pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
-        CriteriaBuilder { inner: Vec::with_capacity(capacity) }
+        CriteriaBuilder {
+            inner: Vec::with_capacity(capacity),
+        }
     }
 
     pub fn reserve(&mut self, additional: usize) {
@@ -80,14 +77,16 @@ impl<'a> CriteriaBuilder<'a>
     }
 
     pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
-    where C: Criterion,
+    where
+        C: Criterion,
     {
         self.push(criterion);
         self
     }
 
     pub fn push<C: 'a>(&mut self, criterion: C)
-    where C: Criterion,
+    where
+        C: Criterion,
     {
         self.inner.push(Box::new(criterion));
     }

View File

@@ -1,7 +1,7 @@
-use std::cmp::Ordering;
-use slice_group_by::GroupBy;
 use crate::criterion::Criterion;
 use crate::RawDocument;
+use slice_group_by::GroupBy;
+use std::cmp::Ordering;
 
 #[inline]
 fn number_of_query_words(query_index: &[u32]) -> usize {

View File

@@ -2,9 +2,9 @@ use std::cmp::Ordering;
 use std::error::Error;
 use std::fmt;
 
-use meilidb_schema::{Schema, SchemaAttr};
 use crate::criterion::Criterion;
-use crate::{RawDocument, RankedMap};
+use crate::{RankedMap, RawDocument};
+use meilidb_schema::{Schema, SchemaAttr};
 
 /// An helper struct that permit to sort documents by
 /// some of their stored attributes.
@@ -51,8 +51,7 @@ impl<'a> SortByAttr<'a> {
         ranked_map: &'a RankedMap,
         schema: &Schema,
         attr_name: &str,
-    ) -> Result<SortByAttr<'a>, SortByAttrError>
-    {
+    ) -> Result<SortByAttr<'a>, SortByAttrError> {
         SortByAttr::new(ranked_map, schema, attr_name, false)
     }
@@ -60,8 +59,7 @@ impl<'a> SortByAttr<'a> {
         ranked_map: &'a RankedMap,
         schema: &Schema,
         attr_name: &str,
-    ) -> Result<SortByAttr<'a>, SortByAttrError>
-    {
+    ) -> Result<SortByAttr<'a>, SortByAttrError> {
         SortByAttr::new(ranked_map, schema, attr_name, true)
     }
@@ -70,8 +68,7 @@ impl<'a> SortByAttr<'a> {
         schema: &Schema,
         attr_name: &str,
         reversed: bool,
-    ) -> Result<SortByAttr<'a>, SortByAttrError>
-    {
+    ) -> Result<SortByAttr<'a>, SortByAttrError> {
         let attr = match schema.attribute(attr_name) {
             Some(attr) => attr,
             None => return Err(SortByAttrError::AttributeNotFound),
@@ -81,7 +78,11 @@ impl<'a> SortByAttr<'a> {
             return Err(SortByAttrError::AttributeNotRegisteredForRanking);
         }
 
-        Ok(SortByAttr { ranked_map, attr, reversed })
+        Ok(SortByAttr {
+            ranked_map,
+            attr,
+            reversed,
+        })
     }
 }
@@ -93,11 +94,15 @@ impl<'a> Criterion for SortByAttr<'a> {
         match (lhs, rhs) {
             (Some(lhs), Some(rhs)) => {
                 let order = lhs.cmp(&rhs);
-                if self.reversed { order.reverse() } else { order }
-            },
+                if self.reversed {
+                    order.reverse()
+                } else {
+                    order
+                }
+            }
             (None, Some(_)) => Ordering::Greater,
             (Some(_), None) => Ordering::Less,
             (None, None) => Ordering::Equal,
         }
     }
@@ -122,4 +127,4 @@ impl fmt::Display for SortByAttrError {
     }
 }
 
-impl Error for SortByAttrError { }
+impl Error for SortByAttrError {}

View File

@@ -11,10 +11,10 @@ use crate::RawDocument;
 #[inline]
 fn custom_log10(n: u8) -> f32 {
     match n {
-        0 => 0.0,      // log(1)
-        1 => 0.30102,  // log(2)
-        2 => 0.47712,  // log(3)
-        3 => 0.60205,  // log(4)
+        0 => 0.0,     // log(1)
+        1 => 0.30102, // log(2)
+        2 => 0.47712, // log(3)
+        3 => 0.60205, // log(4)
         _ => panic!("invalid number"),
     }
 }
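
The constants in `custom_log10` are precomputed values of log10(n + 1). A standalone check (assuming `f32`, as in the function's return type) confirms the table to the printed precision:

```rust
fn main() {
    let table = [(0u8, 0.0f32), (1, 0.30102), (2, 0.47712), (3, 0.60205)];
    for (n, expected) in table {
        let computed = (f32::from(n) + 1.0).log10();
        assert!((computed - expected).abs() < 1e-4, "mismatch at n = {}", n);
    }
    println!("custom_log10 matches log10(n + 1)");
}
```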

View File

@@ -1,7 +1,7 @@
-use std::cmp::Ordering;
-use slice_group_by::GroupBy;
 use crate::criterion::Criterion;
 use crate::RawDocument;
+use slice_group_by::GroupBy;
+use std::cmp::Ordering;
 
 #[inline]
 fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {

View File

@@ -1,7 +1,7 @@
-use std::cmp::Ordering;
-use slice_group_by::GroupBy;
 use crate::criterion::Criterion;
 use crate::RawDocument;
+use slice_group_by::GroupBy;
+use std::cmp::Ordering;
 
 #[inline]
 fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {

View File

@@ -1,7 +1,7 @@
-use std::cmp::{self, Ordering};
-use slice_group_by::GroupBy;
 use crate::criterion::Criterion;
 use crate::RawDocument;
+use slice_group_by::GroupBy;
+use std::cmp::{self, Ordering};
 
 const MAX_DISTANCE: u16 = 8;
@@ -19,7 +19,9 @@ fn index_proximity(lhs: u16, rhs: u16) -> u16 {
 }
 
 fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
-    if lattr != rattr { return MAX_DISTANCE }
+    if lattr != rattr {
+        return MAX_DISTANCE;
+    }
     index_proximity(lwi, rwi)
 }
@@ -42,15 +44,18 @@ fn matches_proximity(
     distance: &[u8],
     attribute: &[u16],
     word_index: &[u16],
-) -> u16
-{
+) -> u16 {
     let mut query_index_groups = query_index.linear_group();
     let mut proximity = 0;
     let mut index = 0;
 
     let get_attr_wi = |index: usize, group_len: usize| {
         // retrieve the first distance group (with the lowest values)
-        let len = distance[index..index + group_len].linear_group().next().unwrap().len();
+        let len = distance[index..index + group_len]
+            .linear_group()
+            .next()
+            .unwrap()
+            .len();
 
         let rattr = &attribute[index..index + len];
         let rwi = &word_index[index..index + len];
@@ -110,7 +115,6 @@ mod tests {
     #[test]
     fn three_different_attributes() {
-
         // "soup" "of the" "the day"
         //
         // { id: 0, attr: 0, attr_index: 0 }
@@ -120,19 +124,21 @@ mod tests {
         // { id: 3, attr: 3, attr_index: 1 }
 
         let query_index = &[0, 1, 2, 2, 3];
-        let distance   = &[0, 0, 0, 0, 0];
-        let attribute  = &[0, 1, 1, 2, 3];
+        let distance = &[0, 0, 0, 0, 0];
+        let attribute = &[0, 1, 1, 2, 3];
         let word_index = &[0, 0, 1, 0, 1];
 
         // soup -> of = 8
         // + of -> the = 1
        // + the -> day = 8 (not 1)
-        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
+        assert_eq!(
+            matches_proximity(query_index, distance, attribute, word_index),
+            17
+        );
     }
 
     #[test]
     fn two_different_attributes() {
-
         // "soup day" "soup of the day"
         //
         // { id: 0, attr: 0, attr_index: 0 }
@@ -143,13 +149,16 @@ mod tests {
         // { id: 3, attr: 1, attr_index: 3 }
 
         let query_index = &[0, 0, 1, 2, 3, 3];
-        let distance   = &[0, 0, 0, 0, 0, 0];
-        let attribute  = &[0, 1, 1, 1, 0, 1];
+        let distance = &[0, 0, 0, 0, 0, 0];
+        let attribute = &[0, 1, 1, 1, 0, 1];
         let word_index = &[0, 0, 1, 2, 1, 3];
 
         // soup -> of = 1
         // + of -> the = 1
         // + the -> day = 1
-        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
+        assert_eq!(
+            matches_proximity(query_index, distance, attribute, word_index),
+            3
+        );
     }
 }
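
To make the expected values in these tests easier to verify: `attribute_proximity` (shown in the hunk above) returns `MAX_DISTANCE` (8) whenever two matches live in different attributes, and otherwise scores the word-index gap via `index_proximity`. The sketch below is standalone; its `index_proximity` body is an assumption, since that function is not part of this diff (forward gap costs the distance, backward gap costs one more, capped at 8):

```rust
const MAX_DISTANCE: u16 = 8;

// Assumed reconstruction; the real index_proximity body is not in the hunks.
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs < rhs {
        (rhs - lhs).min(MAX_DISTANCE)
    } else {
        (lhs - rhs + 1).min(MAX_DISTANCE)
    }
}

fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
    if lattr != rattr {
        return MAX_DISTANCE;
    }
    index_proximity(lwi, rwi)
}

fn main() {
    // First test: "soup" (attr 0) -> "of" (attr 1) = 8, "of" -> "the"
    // (both attr 1, adjacent words) = 1, "the" (attr 2) -> "day" (attr 3) = 8.
    let total = attribute_proximity((0, 0), (1, 0))
        + attribute_proximity((1, 0), (1, 1))
        + attribute_proximity((2, 0), (3, 1));
    assert_eq!(total, 17);
    println!("proximity sum = {}", total);
}
```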

View File

@@ -1,13 +1,13 @@
-use std::collections::hash_map::{HashMap, Entry};
+use std::collections::hash_map::{Entry, HashMap};
 use std::fs::File;
 use std::path::Path;
 use std::sync::{Arc, RwLock};
 use std::{fs, thread};
 
-use zlmdb::{Result as ZResult, CompactionOption};
-use zlmdb::types::{Str, Unit};
 use crossbeam_channel::Receiver;
 use log::{debug, error};
+use zlmdb::types::{Str, Unit};
+use zlmdb::{CompactionOption, Result as ZResult};
 
 use crate::{store, update, Index, MResult};
@@ -32,20 +32,32 @@ fn update_awaiter(
     loop {
         let mut writer = match env.write_txn() {
             Ok(writer) => writer,
-            Err(e) => { error!("LMDB writer transaction begin failed: {}", e); break }
+            Err(e) => {
+                error!("LMDB writer transaction begin failed: {}", e);
+                break;
+            }
         };
 
         match update::update_task(&mut writer, index.clone()) {
             Ok(Some(status)) => {
-                if let Err(e) = writer.commit() { error!("update transaction failed: {}", e) }
+                if let Err(e) = writer.commit() {
+                    error!("update transaction failed: {}", e)
+                }
 
                 if let Some(ref callback) = *update_fn.load() {
                     (callback)(status);
                 }
-            },
+            }
             // no more updates to handle for now
-            Ok(None) => { debug!("no more updates"); writer.abort(); break },
-            Err(e) => { error!("update task failed: {}", e); writer.abort() },
+            Ok(None) => {
+                debug!("no more updates");
+                writer.abort();
+                break;
+            }
+            Err(e) => {
+                error!("update task failed: {}", e);
+                writer.abort()
+            }
         }
     }
 }
@@ -76,14 +88,16 @@ impl Database {
         // open the previously aggregated indexes
         let mut indexes = HashMap::new();
         for index_name in must_open {
-
             let (sender, receiver) = crossbeam_channel::bounded(100);
             let index = match store::open(&env, &index_name, sender.clone())? {
                 Some(index) => index,
                 None => {
-                    log::warn!("the index {} doesn't exist or has not all the databases", index_name);
+                    log::warn!(
+                        "the index {} doesn't exist or has not all the databases",
+                        index_name
+                    );
                     continue;
-                },
+                }
             };
 
             let update_fn = Arc::new(ArcSwapFn::empty());
@@ -100,10 +114,18 @@ impl Database {
             sender.send(()).unwrap();
 
             let result = indexes.insert(index_name, (index, update_fn, handle));
-            assert!(result.is_none(), "The index should not have been already open");
+            assert!(
+                result.is_none(),
+                "The index should not have been already open"
+            );
         }
 
-        Ok(Database { env, common_store, indexes_store, indexes: RwLock::new(indexes) })
+        Ok(Database {
+            env,
+            common_store,
+            indexes_store,
+            indexes: RwLock::new(indexes),
+        })
     }
 
     pub fn open_index(&self, name: impl AsRef<str>) -> Option<Index> {
@@ -152,7 +174,7 @@ impl Database {
                 let update_fn = Some(Arc::new(update_fn));
                 current_update_fn.swap(update_fn);
                 true
-            },
+            }
             None => false,
         }
     }
@@ -160,7 +182,10 @@ impl Database {
     pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
         let indexes_lock = self.indexes.read().unwrap();
         match indexes_lock.get(name.as_ref()) {
-            Some((_, current_update_fn, _)) => { current_update_fn.swap(None); true },
+            Some((_, current_update_fn, _)) => {
+                current_update_fn.swap(None);
+                true
+            }
             None => false,
         }
     }

View File

@@ -1,5 +1,5 @@
-use std::hash::Hash;
 use hashbrown::HashMap;
+use std::hash::Hash;
 
 pub struct DistinctMap<K> {
     inner: HashMap<K, usize>,

View File

@@ -1,6 +1,6 @@
-use std::{error, fmt, io};
+use crate::serde::{DeserializerError, SerializerError};
 use serde_json::Error as SerdeJsonError;
-use crate::serde::{SerializerError, DeserializerError};
+use std::{error, fmt, io};
 
 pub type MResult<T> = Result<T, Error>;
@@ -90,7 +90,7 @@ impl fmt::Display for Error {
     }
 }
 
-impl error::Error for Error { }
+impl error::Error for Error {}
 
 #[derive(Debug)]
 pub enum UnsupportedOperation {

View File

@@ -1,7 +1,9 @@
 #[cfg(test)]
-#[macro_use] extern crate assert_matches;
+#[macro_use]
+extern crate assert_matches;
 
 mod automaton;
+pub mod criterion;
 mod database;
 mod distinct_map;
 mod error;
@@ -9,31 +11,41 @@ mod number;
 mod query_builder;
 mod ranked_map;
 mod raw_document;
-mod reordered_attrs;
-mod update;
-pub mod criterion;
 pub mod raw_indexer;
+mod reordered_attrs;
 pub mod serde;
 pub mod store;
+mod update;
 
-pub use self::database::{Database, BoxUpdateFn};
+pub use self::database::{BoxUpdateFn, Database};
 pub use self::error::{Error, MResult};
 pub use self::number::{Number, ParseNumberError};
 pub use self::ranked_map::RankedMap;
 pub use self::raw_document::RawDocument;
 pub use self::store::Index;
-pub use self::update::{UpdateStatus, UpdateResult, UpdateType};
+pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
 
+use ::serde::{Deserialize, Serialize};
 use zerocopy::{AsBytes, FromBytes};
-use ::serde::{Serialize, Deserialize};
 
 /// Represent an internally generated document unique identifier.
 ///
 /// It is used to inform the database the document you want to deserialize.
 /// Helpful for custom ranking.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
-#[derive(Serialize, Deserialize)]
-#[derive(AsBytes, FromBytes)]
+#[derive(
+    Debug,
+    Copy,
+    Clone,
+    Eq,
+    PartialEq,
+    PartialOrd,
+    Ord,
+    Hash,
+    Serialize,
+    Deserialize,
+    AsBytes,
+    FromBytes,
+)]
 #[repr(C)]
 pub struct DocumentId(pub u64);
@@ -42,8 +54,7 @@ pub struct DocumentId(pub u64);
 ///
 /// This is stored in the map, generated at index time,
 /// extracted and interpreted at search time.
-#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-#[derive(AsBytes, FromBytes)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocIndex {
     /// The document identifier where the word was found.
@@ -109,7 +120,10 @@ pub struct Document {
 impl Document {
     #[cfg(not(test))]
     fn from_raw(raw: RawDocument) -> Document {
-        Document { id: raw.id, highlights: raw.highlights }
+        Document {
+            id: raw.id,
+            highlights: raw.highlights,
+        }
     }
 
     #[cfg(test)]
@@ -134,7 +148,11 @@ impl Document {
             matches.push(match_);
         }
 
-        Document { id: raw.id, matches, highlights: raw.highlights }
+        Document {
+            id: raw.id,
+            matches,
+            highlights: raw.highlights,
+        }
     }
 }
} }

View File

@@ -1,12 +1,11 @@
-use std::num::{ParseIntError, ParseFloatError};
-use std::str::FromStr;
 use std::fmt;
+use std::num::{ParseFloatError, ParseIntError};
+use std::str::FromStr;
 
 use ordered_float::OrderedFloat;
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};
 
-#[derive(Serialize, Deserialize)]
-#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum Number {
     Unsigned(u64),
     Signed(i64),
@@ -32,7 +31,11 @@ impl FromStr for Number {
             Err(error) => error,
         };
 
-        Err(ParseNumberError { uint_error, int_error, float_error })
+        Err(ParseNumberError {
+            uint_error,
+            int_error,
+            float_error,
+        })
     }
 }
@@ -46,10 +49,17 @@ pub struct ParseNumberError {
 impl fmt::Display for ParseNumberError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         if self.uint_error == self.int_error {
-            write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
+            write!(
+                f,
+                "can not parse number: {}, {}",
+                self.uint_error, self.float_error
+            )
         } else {
-            write!(f, "can not parse number: {}, {}, {}",
-                self.uint_error, self.int_error, self.float_error)
+            write!(
+                f,
+                "can not parse number: {}, {}, {}",
+                self.uint_error, self.int_error, self.float_error
+            )
         }
     }
 }
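
For context on why `ParseNumberError` carries three separate errors: `Number::from_str` tries each representation in turn and only fails if all three do. A standalone sketch of that cascade (the `Number` here is a local stand-in; the real enum wraps its float in `OrderedFloat`):

```rust
use std::str::FromStr;

#[derive(Debug, PartialEq)]
enum Number {
    Unsigned(u64),
    Signed(i64),
    Float(f64), // meilidb uses OrderedFloat<f64> here
}

// Try unsigned first, then signed, then float.
fn parse_number(s: &str) -> Option<Number> {
    if let Ok(unsigned) = u64::from_str(s) {
        return Some(Number::Unsigned(unsigned));
    }
    if let Ok(signed) = i64::from_str(s) {
        return Some(Number::Signed(signed));
    }
    f64::from_str(s).ok().map(Number::Float)
}

fn main() {
    assert_eq!(parse_number("42"), Some(Number::Unsigned(42)));
    assert_eq!(parse_number("-7"), Some(Number::Signed(-7)));
    assert_eq!(parse_number("3.5"), Some(Number::Float(3.5)));
    assert_eq!(parse_number("abc"), None);
    println!("all parse paths behave as expected");
}
```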

View File

@ -2,17 +2,17 @@ use hashbrown::HashMap;
use std::mem; use std::mem;
use std::ops::Range; use std::ops::Range;
use std::rc::Rc; use std::rc::Rc;
use std::time::{Instant, Duration}; use std::time::{Duration, Instant};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use sdset::SetBuf; use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut}; use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::{RawDocument, raw_documents_from}; use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria}; use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
use crate::{store, MResult, reordered_attrs::ReorderedAttrs}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
pub struct QueryBuilder<'c, 'f, 'd> { pub struct QueryBuilder<'c, 'f, 'd> {
criteria: Criteria<'c>, criteria: Criteria<'c>,
@ -29,8 +29,7 @@ pub struct QueryBuilder<'c, 'f, 'd> {
fn multiword_rewrite_matches( fn multiword_rewrite_matches(
mut matches: Vec<(DocumentId, TmpMatch)>, mut matches: Vec<(DocumentId, TmpMatch)>,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
) -> SetBuf<(DocumentId, TmpMatch)> ) -> SetBuf<(DocumentId, TmpMatch)> {
{
let mut padded_matches = Vec::with_capacity(matches.len()); let mut padded_matches = Vec::with_capacity(matches.len());
// we sort the matches by word index to make them rewritable // we sort the matches by word index to make them rewritable
@ -38,7 +37,6 @@ fn multiword_rewrite_matches(
// for each attribute of each document // for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
// padding will only be applied // padding will only be applied
// to word indices in the same attribute // to word indices in the same attribute
let mut padding = 0; let mut padding = 0;
@ -47,18 +45,20 @@ fn multiword_rewrite_matches(
// for each match at the same position // for each match at the same position
// in this document attribute // in this document attribute
while let Some(same_word_index) = iter.next() { while let Some(same_word_index) = iter.next() {
// find the biggest padding // find the biggest padding
let mut biggest = 0; let mut biggest = 0;
for (id, match_) in same_word_index { for (id, match_) in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index); let mut replacement = query_enhancer.replacement(match_.query_index);
let replacement_len = replacement.len(); let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
if let Some(query_index) = replacement.next() { if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16; let word_index = match_.word_index + padding as u16;
let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_)); padded_matches.push((*id, match_));
} }
@ -67,22 +67,30 @@ fn multiword_rewrite_matches(
// look ahead and if there already is a match // look ahead and if there already is a match
// corresponding to this padding word, abort the padding // corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() { 'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) { for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16; let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let padmatch = TmpMatch { query_index, word_index, ..match_.clone() }; let padmatch = TmpMatch {
query_index,
word_index,
..*match_
};
for (_, nmatch_) in next_group { for (_, nmatch_) in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index); let mut rep = query_enhancer.replacement(nmatch_.query_index);
let query_index = rep.next().unwrap(); let query_index = rep.next().unwrap();
if query_index == padmatch.query_index { if query_index == padmatch.query_index {
if !found { if !found {
// if we find a corresponding padding for the // if we find a corresponding padding for the
// first time we must push preceding paddings // first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i) { for (i, query_index) in replacement.clone().enumerate().take(i)
let word_index = match_.word_index + padding as u16 + (i + 1) as u16; {
let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; let word_index =
match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_)); padded_matches.push((*id, match_));
biggest = biggest.max(i + 1); biggest = biggest.max(i + 1);
} }
@ -97,7 +105,7 @@ fn multiword_rewrite_matches(
// if we do not find a corresponding padding in the // if we do not find a corresponding padding in the
// next groups so stop here and pad what was found // next groups so stop here and pad what was found
break break;
} }
if !found { if !found {
@ -105,7 +113,11 @@ fn multiword_rewrite_matches(
// we must insert the entire padding // we must insert the entire padding
for (i, query_index) in replacement.enumerate() { for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16; let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_)); padded_matches.push((*id, match_));
} }
@@ -129,16 +141,20 @@ fn fetch_raw_documents(
     automatons: &[Automaton],
     query_enhancer: &QueryEnhancer,
     searchables: Option<&ReorderedAttrs>,
-    main_store: &store::Main,
-    postings_lists_store: &store::PostingsLists,
-    documents_fields_counts_store: &store::DocumentsFieldsCounts,
-) -> MResult<Vec<RawDocument>>
-{
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+) -> MResult<Vec<RawDocument>> {
     let mut matches = Vec::new();
     let mut highlights = Vec::new();

     for automaton in automatons {
-        let Automaton { index, is_exact, query_len, .. } = automaton;
+        let Automaton {
+            index,
+            is_exact,
+            query_len,
+            ..
+        } = automaton;
         let dfa = automaton.dfa();

         let words = match main_store.words_fst(reader)? {
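`fetch_raw_documents` now takes the store handles by value instead of by reference. A sketch of the idea, under the assumption that each store is a cheap `Copy` handle around an LMDB database, which is what clippy's `trivially_copy_pass_by_ref` lint targets; the `db_id` field below is hypothetical:

#[derive(Clone, Copy)]
struct Main {
    db_id: u32, // hypothetical; the real type wraps an LMDB database handle
}

fn words_count(main_store: Main) -> u32 {
    // by-value parameter: the small handle is copied, no `&` indirection
    main_store.db_id
}

fn main() {
    let main_store = Main { db_id: 7 };
    assert_eq!(words_count(main_store), 7);
    // still usable after the call because `Main` is `Copy`
    assert_eq!(words_count(main_store), 7);
}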
@@ -210,8 +226,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         postings_lists: store::PostingsLists,
         documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
-    ) -> QueryBuilder<'c, 'f, 'd>
-    {
+    ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder::with_criteria(
             main,
             postings_lists,
@@ -227,8 +242,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
         criteria: Criteria<'c>,
-    ) -> QueryBuilder<'c, 'f, 'd>
-    {
+    ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder {
             criteria,
             searchable_attrs: None,
@@ -245,7 +259,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
     pub fn with_filter<F>(&mut self, function: F)
-    where F: Fn(DocumentId) -> bool + 'f,
+    where
+        F: Fn(DocumentId) -> bool + 'f,
     {
         self.filter = Some(Box::new(function))
     }
@@ -255,13 +270,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
     }

     pub fn with_distinct<F, K>(&mut self, function: F, size: usize)
-    where F: Fn(DocumentId) -> Option<u64> + 'd,
+    where
+        F: Fn(DocumentId) -> Option<u64> + 'd,
     {
         self.distinct = Some((Box::new(function), size))
     }

     pub fn add_searchable_attribute(&mut self, attribute: u16) {
-        let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new);
+        let reorders = self
+            .searchable_attrs
+            .get_or_insert_with(ReorderedAttrs::new);
         reorders.insert_attribute(attribute);
     }
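rustfmt's default style puts `where` on its own line with one bound per line, as in `with_filter` and `with_distinct` above. Those two methods also show the boxed-closure pattern behind the builder; a self-contained sketch with simplified stand-in names:

struct DocumentId(u64);

struct Builder<'f> {
    filter: Option<Box<dyn Fn(DocumentId) -> bool + 'f>>,
}

impl<'f> Builder<'f> {
    fn new() -> Builder<'f> {
        Builder { filter: None }
    }

    fn with_filter<F>(&mut self, function: F)
    where
        F: Fn(DocumentId) -> bool + 'f,
    {
        // the closure is erased behind a trait object so the builder
        // does not need one type parameter per configured callback
        self.filter = Some(Box::new(function))
    }
}

fn main() {
    let mut builder = Builder::new();
    builder.with_filter(|DocumentId(id)| id % 2 == 0);
    let filter = builder.filter.unwrap();
    assert!(filter(DocumentId(4)));
    assert!(!filter(DocumentId(3)));
}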
@@ -270,41 +288,36 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         reader: &zlmdb::RoTxn,
         query: &str,
         range: Range<usize>,
-    ) -> MResult<Vec<Document>>
-    {
+    ) -> MResult<Vec<Document>> {
         match self.distinct {
-            Some((distinct, distinct_size)) => {
-                raw_query_with_distinct(
-                    reader,
-                    query,
-                    range,
-                    self.filter,
-                    distinct,
-                    distinct_size,
-                    self.timeout,
-                    self.criteria,
-                    self.searchable_attrs,
-                    self.main_store,
-                    self.postings_lists_store,
-                    self.documents_fields_counts_store,
-                    self.synonyms_store,
-                )
-            },
-            None => {
-                raw_query(
-                    reader,
-                    query,
-                    range,
-                    self.filter,
-                    self.timeout,
-                    self.criteria,
-                    self.searchable_attrs,
-                    self.main_store,
-                    self.postings_lists_store,
-                    self.documents_fields_counts_store,
-                    self.synonyms_store,
-                )
-            }
+            Some((distinct, distinct_size)) => raw_query_with_distinct(
+                reader,
+                query,
+                range,
+                self.filter,
+                distinct,
+                distinct_size,
+                self.timeout,
+                self.criteria,
+                self.searchable_attrs,
+                self.main_store,
+                self.postings_lists_store,
+                self.documents_fields_counts_store,
+                self.synonyms_store,
+            ),
+            None => raw_query(
+                reader,
+                query,
+                range,
+                self.filter,
+                self.timeout,
+                self.criteria,
+                self.searchable_attrs,
+                self.main_store,
+                self.postings_lists_store,
+                self.documents_fields_counts_store,
+                self.synonyms_store,
+            ),
         }
     }
 }
@@ -326,7 +339,8 @@ fn raw_query<'c, FI>(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
-where FI: Fn(DocumentId) -> bool,
+where
+    FI: Fn(DocumentId) -> bool,
 {
     // We delegate the filter work to the distinct query builder,
     // specifying a distinct rule that has no effect.
@@ -347,24 +361,20 @@ where FI: Fn(DocumentId) -> bool,
             postings_lists_store,
             documents_fields_counts_store,
             synonyms_store,
-        )
+        );
     }

     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::with_capacity(range.len());

-    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
-        reader,
-        query,
-        main_store,
-        synonyms_store,
-    )?;
+    let (automaton_producer, query_enhancer) =
+        AutomatonProducer::new(reader, query, main_store, synonyms_store)?;

-    let mut automaton_producer = automaton_producer.into_iter();
+    let automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();

     // aggregate automatons groups by groups after time
-    while let Some(auts) = automaton_producer.next() {
+    for auts in automaton_producer {
         automatons.extend(auts);

         // we must retrieve the documents associated
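The `while let Some(auts) = automaton_producer.next()` → `for auts in automaton_producer` change is clippy's `while_let_on_iterator` lint: when a loop drains an iterator to exhaustion, a `for` loop says the same thing without manual `.next()` calls, and the `mut` binding disappears too. A minimal before/after:

fn main() {
    let groups = vec![vec![1, 2], vec![3]];

    // before: a `mut` iterator driven by hand
    let mut iter = groups.clone().into_iter();
    let mut collected_a = Vec::new();
    while let Some(auts) = iter.next() {
        collected_a.extend(auts);
    }

    // after: the `for` loop owns and drives the iterator itself
    let mut collected_b = Vec::new();
    for auts in groups {
        collected_b.extend(auts);
    }

    assert_eq!(collected_a, collected_b);
}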
@@ -374,15 +384,15 @@ where FI: Fn(DocumentId) -> bool,
             &automatons,
             &query_enhancer,
             searchable_attrs.as_ref(),
-            &main_store,
-            &postings_lists_store,
-            &documents_fields_counts_store,
+            main_store,
+            postings_lists_store,
+            documents_fields_counts_store,
         )?;

         // stop processing when time is running out
         if let Some(timeout) = timeout {
             if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
-                break
+                break;
             }
         }
@@ -409,20 +419,27 @@ where FI: Fn(DocumentId) -> bool,
                 // we have sort enough documents if the last document sorted is after
                 // the end of the requested range, we can continue to the next criterion
-                if documents_seen >= range.end { continue 'criteria }
+                if documents_seen >= range.end {
+                    continue 'criteria;
+                }
             }
         }
     }

     // once we classified the documents related to the current
     // automatons we save that as the next valid result
-    let iter = raw_documents.into_iter().skip(range.start).take(range.len());
+    let iter = raw_documents
+        .into_iter()
+        .skip(range.start)
+        .take(range.len());
     raw_documents_processed.clear();
     raw_documents_processed.extend(iter);

     // stop processing when time is running out
     if let Some(timeout) = timeout {
-        if start_processing.elapsed() > timeout { break }
+        if start_processing.elapsed() > timeout {
+            break;
+        }
     }
 }
@@ -430,7 +447,7 @@ where FI: Fn(DocumentId) -> bool,
     // those must be returned
     let documents = raw_documents_processed
         .into_iter()
-        .map(|d| Document::from_raw(d))
+        .map(Document::from_raw)
         .collect();

     Ok(documents)
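`.map(|d| Document::from_raw(d))` → `.map(Document::from_raw)` is clippy's `redundant_closure`: a closure that only forwards its argument can be replaced by the function path itself. A stand-in sketch:

struct RawDocument(u32);
struct Document(u32);

impl Document {
    fn from_raw(raw: RawDocument) -> Document {
        Document(raw.0)
    }
}

fn main() {
    let raw_documents_processed = vec![RawDocument(1), RawDocument(2)];
    let documents: Vec<Document> = raw_documents_processed
        .into_iter()
        .map(Document::from_raw) // instead of `.map(|d| Document::from_raw(d))`
        .collect();
    assert_eq!(documents.len(), 2);
}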
@@ -456,24 +473,21 @@ fn raw_query_with_distinct<'c, FI, FD>(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
-where FI: Fn(DocumentId) -> bool,
-      FD: Fn(DocumentId) -> Option<u64>,
+where
+    FI: Fn(DocumentId) -> bool,
+    FD: Fn(DocumentId) -> Option<u64>,
 {
     let start_processing = Instant::now();
     let mut raw_documents_processed = Vec::new();

-    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
-        reader,
-        query,
-        main_store,
-        synonyms_store,
-    )?;
+    let (automaton_producer, query_enhancer) =
+        AutomatonProducer::new(reader, query, main_store, synonyms_store)?;

-    let mut automaton_producer = automaton_producer.into_iter();
+    let automaton_producer = automaton_producer.into_iter();
     let mut automatons = Vec::new();

     // aggregate automatons groups by groups after time
-    while let Some(auts) = automaton_producer.next() {
+    for auts in automaton_producer {
         automatons.extend(auts);

         // we must retrieve the documents associated
@@ -483,15 +497,15 @@ where FI: Fn(DocumentId) -> bool,
             &automatons,
             &query_enhancer,
             searchable_attrs.as_ref(),
-            &main_store,
-            &postings_lists_store,
-            &documents_fields_counts_store,
+            main_store,
+            postings_lists_store,
+            documents_fields_counts_store,
         )?;

         // stop processing when time is running out
         if let Some(timeout) = timeout {
             if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
-                break
+                break;
             }
         }
@@ -528,7 +542,7 @@ where FI: Fn(DocumentId) -> bool,
                 Some(filter) => {
                     let entry = filter_map.entry(document.id);
                     *entry.or_insert_with(|| (filter)(document.id))
-                },
+                }
                 None => true,
             };
@@ -543,7 +557,9 @@ where FI: Fn(DocumentId) -> bool,
                 }

                 // the requested range end is reached: stop computing distinct
-                if buf_distinct.len() >= range.end { break }
+                if buf_distinct.len() >= range.end {
+                    break;
+                }
             }

             documents_seen += group.len();
@@ -558,7 +574,9 @@ where FI: Fn(DocumentId) -> bool,
                 // we have sort enough documents if the last document sorted is after
                 // the end of the requested range, we can continue to the next criterion
-                if buf_distinct.len() >= range.end { continue 'criteria }
+                if buf_distinct.len() >= range.end {
+                    continue 'criteria;
+                }
             }
         }
     }
@@ -583,14 +601,18 @@ where FI: Fn(DocumentId) -> bool,
             if distinct_accepted && seen.len() > range.start {
                 raw_documents_processed.push(document);
-                if raw_documents_processed.len() == range.len() { break }
+                if raw_documents_processed.len() == range.len() {
+                    break;
+                }
             }
         }
     }

     // stop processing when time is running out
     if let Some(timeout) = timeout {
-        if start_processing.elapsed() > timeout { break }
+        if start_processing.elapsed() > timeout {
+            break;
+        }
     }
 }
@@ -598,7 +620,7 @@ where FI: Fn(DocumentId) -> bool,
     // those must be returned
     let documents = raw_documents_processed
         .into_iter()
-        .map(|d| Document::from_raw(d))
+        .map(Document::from_raw)
         .collect();

     Ok(documents)
@@ -611,20 +633,20 @@ mod tests {
     use std::collections::{BTreeSet, HashMap};
     use std::iter::FromIterator;

-    use fst::{Set, IntoStreamer};
+    use fst::{IntoStreamer, Set};
+    use meilidb_schema::SchemaAttr;
     use sdset::SetBuf;
     use tempfile::TempDir;
-    use meilidb_schema::SchemaAttr;

     use crate::automaton::normalize_str;
     use crate::database::Database;
-    use crate::DocIndex;
     use crate::store::Index;
+    use crate::DocIndex;

     fn set_from_stream<'f, I, S>(stream: I) -> Set
     where
-        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>,
-        S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>,
+        I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>,
+        S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>,
     {
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(stream).unwrap();
@@ -687,14 +709,23 @@ mod tests {
             let word = word.to_lowercase();

-            let alternatives = match self.index.synonyms.synonyms(&writer, word.as_bytes()).unwrap() {
+            let alternatives = match self
+                .index
+                .synonyms
+                .synonyms(&writer, word.as_bytes())
+                .unwrap()
+            {
                 Some(alternatives) => alternatives,
                 None => fst::Set::default(),
             };

             let new = sdset_into_fstset(&new);
-            let new_alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union());
-            self.index.synonyms.put_synonyms(&mut writer, word.as_bytes(), &new_alternatives).unwrap();
+            let new_alternatives =
+                set_from_stream(alternatives.op().add(new.into_stream()).r#union());
+            self.index
+                .synonyms
+                .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives)
+                .unwrap();

             let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() {
                 Some(synonyms) => synonyms,
@@ -702,14 +733,17 @@ mod tests {
             };

             let synonyms_fst = insert_key(&synonyms, word.as_bytes());
-            self.index.main.put_synonyms_fst(&mut writer, &synonyms_fst).unwrap();
+            self.index
+                .main
+                .put_synonyms_fst(&mut writer, &synonyms_fst)
+                .unwrap();

             writer.commit().unwrap();
         }
     }

     impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for TempDatabase {
-        fn from_iter<I: IntoIterator<Item=(&'a str, &'a [DocIndex])>>(iter: I) -> Self {
+        fn from_iter<I: IntoIterator<Item = (&'a str, &'a [DocIndex])>>(iter: I) -> Self {
             let tempdir = TempDir::new().unwrap();
             let database = Database::open_or_create(&tempdir).unwrap();
             let index = database.create_index("default").unwrap();
@@ -724,7 +758,10 @@ mod tests {
         for (word, indexes) in iter {
             let word = word.to_lowercase().into_bytes();
             words_fst.insert(word.clone());
-            postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
+            postings_lists
+                .entry(word)
+                .or_insert_with(Vec::new)
+                .extend_from_slice(indexes);
             for idx in indexes {
                 fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
             }
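The `postings_lists.entry(word).or_insert_with(Vec::new)` chain above groups doc indexes under a word with a single map lookup; `or_insert_with` only allocates the `Vec` when the key is new. A self-contained sketch of the pattern:

use std::collections::BTreeMap;

fn main() {
    let pairs = [("new", 0u32), ("york", 1), ("new", 2)];
    let mut postings_lists: BTreeMap<&str, Vec<u32>> = BTreeMap::new();
    for &(word, index) in pairs.iter() {
        // one lookup: insert an empty Vec if absent, then push into it
        postings_lists.entry(word).or_insert_with(Vec::new).push(index);
    }
    assert_eq!(postings_lists["new"], vec![0, 2]);
    assert_eq!(postings_lists["york"], vec![1]);
}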
@@ -736,31 +773,33 @@ mod tests {
         for (word, postings_list) in postings_lists {
             let postings_list = SetBuf::from_dirty(postings_list);
-            index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
+            index
+                .postings_lists
+                .put_postings_list(&mut writer, &word, &postings_list)
+                .unwrap();
         }

         for ((docid, attr, _), count) in fields_counts {
-            let prev = index.documents_fields_counts
-                .document_field_count(
-                    &mut writer,
-                    docid,
-                    SchemaAttr(attr),
-                ).unwrap();
+            let prev = index
+                .documents_fields_counts
+                .document_field_count(&mut writer, docid, SchemaAttr(attr))
+                .unwrap();

             let prev = prev.unwrap_or(0);

-            index.documents_fields_counts
-                .put_document_field_count(
-                    &mut writer,
-                    docid,
-                    SchemaAttr(attr),
-                    prev + count,
-                ).unwrap();
+            index
+                .documents_fields_counts
+                .put_document_field_count(&mut writer, docid, SchemaAttr(attr), prev + count)
+                .unwrap();
         }

         writer.commit().unwrap();

-        TempDatabase { database, index, _tempdir: tempdir }
+        TempDatabase {
+            database,
+            index,
+            _tempdir: tempdir,
+        }
     }
 }
@@ -768,8 +807,8 @@ mod tests {
     fn simple() {
         let store = TempDatabase::from_iter(vec![
             ("iphone", &[doc_char_index(0, 0, 0)][..]),
-            ("from",   &[doc_char_index(0, 1, 1)][..]),
-            ("apple",  &[doc_char_index(0, 2, 2)][..]),
+            ("from", &[doc_char_index(0, 1, 1)][..]),
+            ("apple", &[doc_char_index(0, 2, 2)][..]),
         ]);

         let env = &store.database.env;
@@ -791,9 +830,7 @@ mod tests {

     #[test]
     fn simple_synonyms() {
-        let mut store = TempDatabase::from_iter(vec![
-            ("hello", &[doc_index(0, 0)][..]),
-        ]);
+        let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);

         store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
@@ -825,9 +862,7 @@ mod tests {

     #[test]
     fn prefix_synonyms() {
-        let mut store = TempDatabase::from_iter(vec![
-            ("hello", &[doc_index(0, 0)][..]),
-        ]);
+        let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);

         store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
         store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
@@ -872,9 +907,7 @@ mod tests {

     #[test]
     fn levenshtein_synonyms() {
-        let mut store = TempDatabase::from_iter(vec![
-            ("hello", &[doc_index(0, 0)][..]),
-        ]);
+        let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);

         store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
@@ -907,9 +940,9 @@ mod tests {
     #[test]
     fn harder_synonyms() {
         let mut store = TempDatabase::from_iter(vec![
-            ("hello",   &[doc_index(0, 0)][..]),
+            ("hello", &[doc_index(0, 0)][..]),
             ("bonjour", &[doc_index(1, 3)]),
-            ("salut",   &[doc_index(2, 5)]),
+            ("salut", &[doc_index(2, 5)]),
         ]);

         store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"]));
@@ -987,17 +1020,22 @@ mod tests {
     /// Unique word has multi-word synonyms
     fn unique_to_multiword_synonyms() {
         let mut store = TempDatabase::from_iter(vec![
             ("new", &[doc_char_index(0, 0, 0)][..]),
             ("york", &[doc_char_index(0, 1, 1)][..]),
             ("city", &[doc_char_index(0, 2, 2)][..]),
             ("subway", &[doc_char_index(0, 3, 3)][..]),
-
             ("NY", &[doc_char_index(1, 0, 0)][..]),
             ("subway", &[doc_char_index(1, 1, 1)][..]),
         ]);

-        store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
-        store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
+        store.add_synonym(
+            "NY",
+            SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
+        );
+        store.add_synonym(
+            "NYC",
+            SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
+        );

         let env = &store.database.env;
         let reader = env.read_txn().unwrap();
@@ -1056,20 +1094,18 @@ mod tests {
     #[test]
     fn unique_to_multiword_synonyms_words_proximity() {
         let mut store = TempDatabase::from_iter(vec![
             ("new", &[doc_char_index(0, 0, 0)][..]),
             ("york", &[doc_char_index(0, 1, 1)][..]),
             ("city", &[doc_char_index(0, 2, 2)][..]),
             ("subway", &[doc_char_index(0, 3, 3)][..]),
-
             ("york", &[doc_char_index(1, 0, 0)][..]),
             ("new", &[doc_char_index(1, 1, 1)][..]),
             ("subway", &[doc_char_index(1, 2, 2)][..]),
-
             ("NY", &[doc_char_index(2, 0, 0)][..]),
             ("subway", &[doc_char_index(2, 1, 1)][..]),
         ]);

         store.add_synonym("NY", SetBuf::from_dirty(vec!["york new"]));

         let env = &store.database.env;
         let reader = env.read_txn().unwrap();
@@ -1120,11 +1156,10 @@ mod tests {
     #[test]
     fn unique_to_multiword_synonyms_cumulative_word_index() {
         let mut store = TempDatabase::from_iter(vec![
             ("NY", &[doc_char_index(0, 0, 0)][..]),
             ("subway", &[doc_char_index(0, 1, 1)][..]),
-
             ("new", &[doc_char_index(1, 0, 0)][..]),
             ("york", &[doc_char_index(1, 1, 1)][..]),
             ("subway", &[doc_char_index(1, 2, 2)][..]),
         ]);
@@ -1175,20 +1210,25 @@ mod tests {
     /// Unique word has multi-word synonyms
     fn harder_unique_to_multiword_synonyms_one() {
         let mut store = TempDatabase::from_iter(vec![
             ("new", &[doc_char_index(0, 0, 0)][..]),
             ("york", &[doc_char_index(0, 1, 1)][..]),
             ("city", &[doc_char_index(0, 2, 2)][..]),
             ("yellow", &[doc_char_index(0, 3, 3)][..]),
             ("subway", &[doc_char_index(0, 4, 4)][..]),
             ("broken", &[doc_char_index(0, 5, 5)][..]),
-
             ("NY", &[doc_char_index(1, 0, 0)][..]),
             ("blue", &[doc_char_index(1, 1, 1)][..]),
             ("subway", &[doc_char_index(1, 2, 2)][..]),
         ]);

-        store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
-        store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
+        store.add_synonym(
+            "NY",
+            SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
+        );
+        store.add_synonym(
+            "NYC",
+            SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
+        );

         let env = &store.database.env;
         let reader = env.read_txn().unwrap();
@@ -1249,21 +1289,26 @@ mod tests {
     /// Unique word has multi-word synonyms
     fn even_harder_unique_to_multiword_synonyms() {
         let mut store = TempDatabase::from_iter(vec![
             ("new", &[doc_char_index(0, 0, 0)][..]),
             ("york", &[doc_char_index(0, 1, 1)][..]),
             ("city", &[doc_char_index(0, 2, 2)][..]),
             ("yellow", &[doc_char_index(0, 3, 3)][..]),
             ("underground", &[doc_char_index(0, 4, 4)][..]),
             ("train", &[doc_char_index(0, 5, 5)][..]),
             ("broken", &[doc_char_index(0, 6, 6)][..]),
-
             ("NY", &[doc_char_index(1, 0, 0)][..]),
             ("blue", &[doc_char_index(1, 1, 1)][..]),
             ("subway", &[doc_char_index(1, 2, 2)][..]),
         ]);

-        store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
-        store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"]));
+        store.add_synonym(
+            "NY",
+            SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]),
+        );
+        store.add_synonym(
+            "NYC",
+            SetBuf::from_dirty(vec!["NY", "new york", "new york city"]),
+        );
         store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));

         let env = &store.database.env;
@@ -1330,30 +1375,36 @@ mod tests {
     /// Multi-word has multi-word synonyms
     fn multiword_to_multiword_synonyms() {
         let mut store = TempDatabase::from_iter(vec![
             ("NY", &[doc_char_index(0, 0, 0)][..]),
             ("subway", &[doc_char_index(0, 1, 1)][..]),
-
             ("NYC", &[doc_char_index(1, 0, 0)][..]),
             ("blue", &[doc_char_index(1, 1, 1)][..]),
             ("subway", &[doc_char_index(1, 2, 2)][..]),
             ("broken", &[doc_char_index(1, 3, 3)][..]),
-
             ("new", &[doc_char_index(2, 0, 0)][..]),
             ("york", &[doc_char_index(2, 1, 1)][..]),
             ("underground", &[doc_char_index(2, 2, 2)][..]),
             ("train", &[doc_char_index(2, 3, 3)][..]),
             ("broken", &[doc_char_index(2, 4, 4)][..]),
         ]);

-        store.add_synonym("new york", SetBuf::from_dirty(vec![ "NYC", "NY", "new york city" ]));
-        store.add_synonym("new york city", SetBuf::from_dirty(vec![ "NYC", "NY", "new york" ]));
-        store.add_synonym("underground train", SetBuf::from_dirty(vec![ "subway" ]));
+        store.add_synonym(
+            "new york",
+            SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]),
+        );
+        store.add_synonym(
+            "new york city",
+            SetBuf::from_dirty(vec!["NYC", "NY", "new york"]),
+        );
+        store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));

         let env = &store.database.env;
         let reader = env.read_txn().unwrap();

         let builder = store.query_builder();
-        let results = builder.query(&reader, "new york underground train broken", 0..20).unwrap();
+        let results = builder
+            .query(&reader, "new york underground train broken", 0..20)
+            .unwrap();
         let mut iter = results.into_iter();

         assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
@@ -1390,7 +1441,9 @@ mod tests {
         assert_matches!(iter.next(), None);

         let builder = store.query_builder();
-        let results = builder.query(&reader, "new york city underground train broken", 0..20).unwrap();
+        let results = builder
+            .query(&reader, "new york city underground train broken", 0..20)
+            .unwrap();
         let mut iter = results.into_iter();

         assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
@@ -1436,14 +1489,14 @@ mod tests {
     #[test]
     fn intercrossed_multiword_synonyms() {
         let mut store = TempDatabase::from_iter(vec![
             ("new", &[doc_index(0, 0)][..]),
             ("york", &[doc_index(0, 1)][..]),
             ("big", &[doc_index(0, 2)][..]),
             ("city", &[doc_index(0, 3)][..]),
         ]);

-        store.add_synonym("new york", SetBuf::from_dirty(vec![ "new york city" ]));
-        store.add_synonym("new york city", SetBuf::from_dirty(vec![ "new york" ]));
+        store.add_synonym("new york", SetBuf::from_dirty(vec!["new york city"]));
+        store.add_synonym("new york city", SetBuf::from_dirty(vec!["new york"]));

         let env = &store.database.env;
         let reader = env.read_txn().unwrap();
@@ -1469,16 +1522,14 @@ mod tests {
         assert_matches!(iter.next(), None);

         let mut store = TempDatabase::from_iter(vec![
             ("NY", &[doc_index(0, 0)][..]),
             ("city", &[doc_index(0, 1)][..]),
             ("subway", &[doc_index(0, 2)][..]),
-
             ("NY", &[doc_index(1, 0)][..]),
             ("subway", &[doc_index(1, 1)][..]),
-
             ("NY", &[doc_index(2, 0)][..]),
             ("york", &[doc_index(2, 1)][..]),
             ("city", &[doc_index(2, 2)][..]),
             ("subway", &[doc_index(2, 3)][..]),
         ]);
@@ -1525,20 +1576,22 @@ mod tests {
     #[test]
     fn cumulative_word_indices() {
         let mut store = TempDatabase::from_iter(vec![
             ("NYC", &[doc_index(0, 0)][..]),
             ("long", &[doc_index(0, 1)][..]),
             ("subway", &[doc_index(0, 2)][..]),
             ("cool", &[doc_index(0, 3)][..]),
         ]);

         store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC"]));
         store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));

         let env = &store.database.env;
         let reader = env.read_txn().unwrap();

         let builder = store.query_builder();
-        let results = builder.query(&reader, "new york city long subway cool ", 0..20).unwrap();
+        let results = builder
+            .query(&reader, "new york city long subway cool ", 0..20)
+            .unwrap();
         let mut iter = results.into_iter();

         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@@ -1560,8 +1613,7 @@ mod tests {
         let mut store = TempDatabase::from_iter(vec![
             ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
             ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex
-
             ("iphone", &[doc_index(1, 0)][..]),
         ]);

         store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iphone"]));
@@ -1624,8 +1676,8 @@ mod tests {
     #[test]
     fn simple_concatenation() {
         let store = TempDatabase::from_iter(vec![
             ("iphone", &[doc_index(0, 0)][..]),
-            ("case",   &[doc_index(0, 1)][..]),
+            ("case", &[doc_index(0, 1)][..]),
         ]);

         let env = &store.database.env;


@@ -2,12 +2,11 @@ use std::io::{Read, Write};
 use hashbrown::HashMap;
 use meilidb_schema::SchemaAttr;
-use serde::{Serialize, Deserialize};
+use serde::{Deserialize, Serialize};

 use crate::{DocumentId, Number};

-#[derive(Debug, Default, Clone, PartialEq, Eq)]
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
@@ -16,6 +15,10 @@ impl RankedMap {
         self.0.len()
     }

+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
     pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
         self.0.insert((document, attribute), number);
     }
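The new `is_empty` method pairs with the existing `len`; clippy's `len_without_is_empty` lint asks for exactly this, so callers can state the intent directly instead of comparing against zero. A minimal stand-in:

use std::collections::HashMap;

struct RankedMap(HashMap<u64, i64>);

impl RankedMap {
    fn len(&self) -> usize {
        self.0.len()
    }

    fn is_empty(&self) -> bool {
        self.0.is_empty()
    }
}

fn main() {
    let map = RankedMap(HashMap::new());
    assert!(map.is_empty()); // clearer than `map.len() == 0`
    assert_eq!(map.len(), 0);
}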


@@ -1,11 +1,11 @@
-use std::sync::Arc;
 use std::fmt;
+use std::sync::Arc;

 use meilidb_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;

-use crate::{TmpMatch, DocumentId, Highlight};
+use crate::{DocumentId, Highlight, TmpMatch};

 #[derive(Clone)]
 pub struct RawDocument {
@@ -20,7 +20,13 @@ impl RawDocument {
         let r = self.matches.range;
         // it is safe because construction/modifications
         // can only be done in this module
-        unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
+        unsafe {
+            &self
+                .matches
+                .matches
+                .query_index
+                .get_unchecked(r.start..r.end)
+        }
     }

     pub fn distance(&self) -> &[u8] {
@@ -41,7 +47,13 @@ impl RawDocument {
         let r = self.matches.range;
         // it is safe because construction/modifications
         // can only be done in this module
-        unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
+        unsafe {
+            &self
+                .matches
+                .matches
+                .word_index
+                .get_unchecked(r.start..r.end)
+        }
     }

     pub fn is_exact(&self) -> &[bool] {
@@ -55,12 +67,32 @@ impl RawDocument {
 impl fmt::Debug for RawDocument {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         f.write_str("RawDocument {\r\n")?;
         f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
+        f.write_fmt(format_args!(
+            "{:>15}: {:^5?},\r\n",
+            "query_index",
+            self.query_index()
+        ))?;
+        f.write_fmt(format_args!(
+            "{:>15}: {:^5?},\r\n",
+            "distance",
+            self.distance()
+        ))?;
+        f.write_fmt(format_args!(
+            "{:>15}: {:^5?},\r\n",
+            "attribute",
+            self.attribute()
+        ))?;
+        f.write_fmt(format_args!(
+            "{:>15}: {:^5?},\r\n",
+            "word_index",
+            self.word_index()
+        ))?;
+        f.write_fmt(format_args!(
+            "{:>15}: {:^5?},\r\n",
+            "is_exact",
+            self.is_exact()
+        ))?;
         f.write_str("}")?;
         Ok(())
     }
@@ -70,8 +102,7 @@ pub fn raw_documents_from(
     matches: SetBuf<(DocumentId, TmpMatch)>,
     highlights: SetBuf<(DocumentId, Highlight)>,
     fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
-) -> Vec<RawDocument>
-{
+) -> Vec<RawDocument> {
     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
     let mut matches2 = Matches::with_capacity(matches.len());
@@ -94,10 +125,21 @@ pub fn raw_documents_from(
     }

     let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
-        let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument { id, matches, highlights, fields_counts }
-    }).collect()
+    docs_ranges
+        .into_iter()
+        .map(|(id, range, highlights, fields_counts)| {
+            let matches = SharedMatches {
+                range,
+                matches: matches.clone(),
+            };
+            RawDocument {
+                id,
+                matches,
+                highlights,
+                fields_counts,
+            }
+        })
+        .collect()
 }

 #[derive(Debug, Copy, Clone)]


@@ -1,10 +1,10 @@
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;

+use crate::{DocIndex, DocumentId};
 use deunicode::deunicode_with_tofu;
-use crate::{DocumentId, DocIndex};
 use meilidb_schema::SchemaAttr;
-use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
+use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
 use sdset::SetBuf;

 type Word = Vec<u8>; // TODO make it be a SmallVec
@@ -60,7 +60,9 @@ impl RawIndexer {
                 &mut self.docs_words,
             );

-            if !must_continue { break }
+            if !must_continue {
+                break;
+            }

             number_of_words += 1;
         }
@@ -70,8 +72,9 @@ impl RawIndexer {
     }

     pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
-    where I: IntoIterator<Item=&'a str, IntoIter=IT>,
-          IT: Iterator<Item=&'a str> + Clone,
+    where
+        I: IntoIterator<Item = &'a str, IntoIter = IT>,
+        IT: Iterator<Item = &'a str> + Clone,
     {
         // TODO serialize this to one call to the SeqTokenizer loop
@@ -88,14 +91,25 @@ impl RawIndexer {
                 &mut self.docs_words,
             );

-            if !must_continue { break }
+            if !must_continue {
+                break;
+            }
         }

-        let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
-            if lowercase_text.contains(is_cjk) { return lowercase_text }
-            let deunicoded = deunicode_with_tofu(&lowercase_text, "");
-            if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
-        }).collect();
+        let deunicoded: Vec<_> = lowercased
+            .into_iter()
+            .map(|lowercase_text| {
+                if lowercase_text.contains(is_cjk) {
+                    return lowercase_text;
+                }
+                let deunicoded = deunicode_with_tofu(&lowercase_text, "");
+                if lowercase_text != deunicoded {
+                    deunicoded
+                } else {
+                    lowercase_text
+                }
+            })
+            .collect();

         let iter = deunicoded.iter().map(|t| t.as_str());
         for token in SeqTokenizer::new(iter) {
@@ -108,17 +122,21 @@ impl RawIndexer {
                 &mut self.docs_words,
             );

-            if !must_continue { break }
+            if !must_continue {
+                break;
+            }
         }
     }

     pub fn build(self) -> Indexed {
-        let words_doc_indexes = self.words_doc_indexes
+        let words_doc_indexes = self
+            .words_doc_indexes
             .into_iter()
             .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
             .collect();

-        let docs_words = self.docs_words
+        let docs_words = self
+            .docs_words
             .into_iter()
             .map(|(id, mut words)| {
                 words.sort_unstable();
@@ -127,7 +145,16 @@ impl RawIndexer {
             })
             .collect();

-        Indexed { words_doc_indexes, docs_words }
+        Indexed {
+            words_doc_indexes,
+            docs_words,
+        }
+    }
+}
+
+impl Default for RawIndexer {
+    fn default() -> Self {
+        Self::new()
     }
 }
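The added `impl Default for RawIndexer` answers clippy's `new_without_default`: a public, zero-argument `new` should also be reachable through `Default`, so the type composes with `Default::default()` call sites and derived containers. A stand-in sketch (the `word_limit` field and its value are hypothetical):

struct RawIndexer {
    word_limit: usize,
}

impl RawIndexer {
    fn new() -> RawIndexer {
        RawIndexer { word_limit: 1000 } // hypothetical default
    }
}

impl Default for RawIndexer {
    fn default() -> Self {
        // forward to `new` so there is a single source of truth
        Self::new()
    }
}

fn main() {
    assert_eq!(RawIndexer::new().word_limit, RawIndexer::default().word_limit);
}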
@@ -138,16 +165,20 @@ fn index_token(
     word_limit: usize,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
-) -> bool
-{
-    if token.word_index >= word_limit { return false }
+) -> bool {
+    if token.word_index >= word_limit {
+        return false;
+    }

     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
-            words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
+            words_doc_indexes
+                .entry(word.clone())
+                .or_insert_with(Vec::new)
+                .push(docindex);
             docs_words.entry(id).or_insert_with(Vec::new).push(word);
-        },
+        }
         None => return false,
     }
@@ -183,7 +214,9 @@ mod tests {
         let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
         indexer.index_text(docid, attr, text);

-        let Indexed { words_doc_indexes, .. } = indexer.build();
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();

         assert!(words_doc_indexes.get(&b"l"[..]).is_some());
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
@@ -191,7 +224,9 @@ mod tests {
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

         // with the ugly apostrophe...
-        assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes
+            .get(&"l’éteindre".to_owned().into_bytes())
+            .is_some());
     }

     #[test]
@@ -203,7 +238,9 @@ mod tests {
         let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
         indexer.index_text_seq(docid, attr, text);

-        let Indexed { words_doc_indexes, .. } = indexer.build();
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();

         assert!(words_doc_indexes.get(&b"l"[..]).is_some());
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
@@ -211,6 +248,8 @@ mod tests {
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());

         // with the ugly apostrophe...
-        assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes
+            .get(&"l’éteindre".to_owned().into_bytes())
+            .is_some());
     }
 }


@@ -6,7 +6,10 @@ pub struct ReorderedAttrs {

 impl ReorderedAttrs {
     pub fn new() -> ReorderedAttrs {
-        ReorderedAttrs { count: 0, reorders: Vec::new() }
+        ReorderedAttrs {
+            count: 0,
+            reorders: Vec::new(),
+        }
     }

     pub fn insert_attribute(&mut self, attribute: u16) {


@@ -77,13 +77,18 @@ impl ser::Serializer for ConvertToNumber {
     }

     fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnrankableType { type_name: "Option" })
+        Err(SerializerError::UnrankableType {
+            type_name: "Option",
+        })
     }

     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
-    where T: Serialize,
+    where
+        T: Serialize,
     {
-        Err(SerializerError::UnrankableType { type_name: "Option" })
+        Err(SerializerError::UnrankableType {
+            type_name: "Option",
+        })
     }

     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@@ -91,25 +96,29 @@ impl ser::Serializer for ConvertToNumber {
     }

     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnrankableType { type_name: "unit struct" })
+        Err(SerializerError::UnrankableType {
+            type_name: "unit struct",
+        })
     }

     fn serialize_unit_variant(
         self,
         _name: &'static str,
         _variant_index: u32,
-        _variant: &'static str
-    ) -> Result<Self::Ok, Self::Error>
-    {
-        Err(SerializerError::UnrankableType { type_name: "unit variant" })
+        _variant: &'static str,
+    ) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnrankableType {
+            type_name: "unit variant",
+        })
     }

     fn serialize_newtype_struct<T: ?Sized>(
         self,
         _name: &'static str,
-        value: &T
+        value: &T,
     ) -> Result<Self::Ok, Self::Error>
-    where T: Serialize,
+    where
+        T: Serialize,
     {
         value.serialize(self)
     }
@@ -119,15 +128,20 @@ impl ser::Serializer for ConvertToNumber {
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
-        _value: &T
+        _value: &T,
     ) -> Result<Self::Ok, Self::Error>
-    where T: Serialize,
+    where
+        T: Serialize,
     {
-        Err(SerializerError::UnrankableType { type_name: "newtype variant" })
+        Err(SerializerError::UnrankableType {
+            type_name: "newtype variant",
+        })
     }

     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(SerializerError::UnrankableType { type_name: "sequence" })
+        Err(SerializerError::UnrankableType {
+            type_name: "sequence",
+        })
     }

     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@@ -137,10 +151,11 @@ impl ser::Serializer for ConvertToNumber {
     fn serialize_tuple_struct(
         self,
         _name: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeTupleStruct, Self::Error>
-    {
-        Err(SerializerError::UnrankableType { type_name: "tuple struct" })
+        _len: usize,
+    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
+        Err(SerializerError::UnrankableType {
+            type_name: "tuple struct",
+        })
     }

     fn serialize_tuple_variant(
@@ -148,10 +163,11 @@ impl ser::Serializer for ConvertToNumber {
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeTupleVariant, Self::Error>
-    {
-        Err(SerializerError::UnrankableType { type_name: "tuple variant" })
+        _len: usize,
+    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
+        Err(SerializerError::UnrankableType {
+            type_name: "tuple variant",
+        })
     }

     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@@ -161,10 +177,11 @@ impl ser::Serializer for ConvertToNumber {
     fn serialize_struct(
         self,
         _name: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeStruct, Self::Error>
-    {
-        Err(SerializerError::UnrankableType { type_name: "struct" })
+        _len: usize,
+    ) -> Result<Self::SerializeStruct, Self::Error> {
+        Err(SerializerError::UnrankableType {
+            type_name: "struct",
+        })
     }

     fn serialize_struct_variant(
@@ -172,9 +189,10 @@ impl ser::Serializer for ConvertToNumber {
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeStructVariant, Self::Error>
-    {
-        Err(SerializerError::UnrankableType { type_name: "struct variant" })
+        _len: usize,
+    ) -> Result<Self::SerializeStructVariant, Self::Error> {
+        Err(SerializerError::UnrankableType {
+            type_name: "struct variant",
+        })
     }
 }
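Every unsupported shape in `ConvertToNumber` is rejected with the same `UnrankableType { type_name }` error, which rustfmt now spreads over three lines. A self-contained sketch of the pattern (simplified; the real error type has more variants):

#[derive(Debug)]
enum SerializerError {
    UnrankableType { type_name: &'static str },
}

fn serialize_seq() -> Result<u64, SerializerError> {
    Err(SerializerError::UnrankableType {
        type_name: "sequence",
    })
}

fn main() {
    match serialize_seq() {
        Err(SerializerError::UnrankableType { type_name }) => {
            assert_eq!(type_name, "sequence")
        }
        Ok(_) => unreachable!(),
    }
}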


@@ -1,5 +1,5 @@
-use serde::Serialize;
 use serde::ser;
+use serde::Serialize;

 use super::SerializerError;
@@ -17,7 +19,9 @@ impl ser::Serializer for ConvertToString {
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

     fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { type_name: "boolean" })
+        Err(SerializerError::UnserializableType {
+            type_name: "boolean",
+        })
     }

     fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
@@ -73,13 +75,18 @@ impl ser::Serializer for ConvertToString {
     }

     fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { type_name: "Option" })
+        Err(SerializerError::UnserializableType {
+            type_name: "Option",
+        })
     }

     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
-    where T: Serialize,
+    where
+        T: Serialize,
     {
-        Err(SerializerError::UnserializableType { type_name: "Option" })
+        Err(SerializerError::UnserializableType {
+            type_name: "Option",
+        })
     }

     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@@ -87,25 +94,29 @@ impl ser::Serializer for ConvertToString {
     }

     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { type_name: "unit struct" })
+        Err(SerializerError::UnserializableType {
+            type_name: "unit struct",
+        })
     }

     fn serialize_unit_variant(
         self,
         _name: &'static str,
         _variant_index: u32,
-        _variant: &'static str
-    ) -> Result<Self::Ok, Self::Error>
-    {
-        Err(SerializerError::UnserializableType { type_name: "unit variant" })
+        _variant: &'static str,
+    ) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType {
+            type_name: "unit variant",
+        })
     }

     fn serialize_newtype_struct<T: ?Sized>(
         self,
         _name: &'static str,
-        value: &T
+        value: &T,
     ) -> Result<Self::Ok, Self::Error>
-    where T: Serialize,
+    where
+        T: Serialize,
     {
         value.serialize(self)
     }
@@ -115,15 +126,20 @@ impl ser::Serializer for ConvertToString {
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
-        _value: &T
+        _value: &T,
     ) -> Result<Self::Ok, Self::Error>
-    where T: Serialize,
+    where
+        T: Serialize,
     {
-        Err(SerializerError::UnserializableType { type_name: "newtype variant" })
+        Err(SerializerError::UnserializableType {
+            type_name: "newtype variant",
+        })
     }

     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(SerializerError::UnserializableType { type_name: "sequence" })
+        Err(SerializerError::UnserializableType {
+            type_name: "sequence",
+        })
     }

     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@@ -133,10 +149,11 @@ impl ser::Serializer for ConvertToString {
     fn serialize_tuple_struct(
         self,
         _name: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeTupleStruct, Self::Error>
-    {
-        Err(SerializerError::UnserializableType { type_name: "tuple struct" })
+        _len: usize,
+    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
+        Err(SerializerError::UnserializableType {
+            type_name: "tuple struct",
+        })
     }

     fn serialize_tuple_variant(
@@ -144,10 +161,11 @@ impl ser::Serializer for ConvertToString {
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeTupleVariant, Self::Error>
-    {
-        Err(SerializerError::UnserializableType { type_name: "tuple variant" })
+        _len: usize,
+    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
+        Err(SerializerError::UnserializableType {
+            type_name: "tuple variant",
+        })
     }

     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@@ -157,10 +175,11 @@ impl ser::Serializer for ConvertToString {
     fn serialize_struct(
         self,
         _name: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeStruct, Self::Error>
-    {
-        Err(SerializerError::UnserializableType { type_name: "struct" })
+        _len: usize,
+    ) -> Result<Self::SerializeStruct, Self::Error> {
+        Err(SerializerError::UnserializableType {
+            type_name: "struct",
+        })
     }

     fn serialize_struct_variant(
@@ -168,9 +187,10 @@ impl ser::Serializer for ConvertToString {
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
-        _len: usize
-    ) -> Result<Self::SerializeStructVariant, Self::Error>
-    {
-        Err(SerializerError::UnserializableType { type_name: "struct variant" })
+        _len: usize,
+    ) -> Result<Self::SerializeStructVariant, Self::Error> {
+        Err(SerializerError::UnserializableType {
+            type_name: "struct variant",
+        })
     }
 }


@ -1,12 +1,12 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::io::Cursor; use std::io::Cursor;
use std::{fmt, error::Error}; use std::{error::Error, fmt};
use meilidb_schema::{Schema, SchemaAttr}; use meilidb_schema::{Schema, SchemaAttr};
use serde_json::Error as SerdeJsonError;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde::{de, forward_to_deserialize_any}; use serde::{de, forward_to_deserialize_any};
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::Error as SerdeJsonError;
use crate::store::DocumentsFields; use crate::store::DocumentsFields;
use crate::DocumentId; use crate::DocumentId;
@ -60,7 +60,8 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
type Error = DeserializerError; type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error> fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de> where
V: de::Visitor<'de>,
{ {
self.deserialize_map(visitor) self.deserialize_map(visitor)
} }
@ -72,16 +73,21 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
} }
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error> fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de> where
V: de::Visitor<'de>,
{ {
let mut error = None; let mut error = None;
let iter = self.documents_fields let iter = self
.documents_fields
.document_fields(self.reader, self.document_id)? .document_fields(self.reader, self.document_id)?
.filter_map(|result| { .filter_map(|result| {
let (attr, value) = match result { let (attr, value) = match result {
Ok(value) => value, Ok(value) => value,
Err(e) => { error = Some(e); return None }, Err(e) => {
error = Some(e);
return None;
}
}; };
let is_displayed = self.schema.props(attr).is_displayed(); let is_displayed = self.schema.props(attr).is_displayed();
@ -99,7 +105,9 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
}); });
let map_deserializer = de::value::MapDeserializer::new(iter); let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from); let result = visitor
.visit_map(map_deserializer)
.map_err(DeserializerError::from);
match error.take() { match error.take() {
Some(error) => Err(error.into()), Some(error) => Err(error.into()),
@ -122,7 +130,8 @@ impl<'de> de::Deserializer<'de> for Value {
type Error = SerdeJsonError; type Error = SerdeJsonError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error> fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de> where
V: de::Visitor<'de>,
{ {
self.0.deserialize_any(visitor) self.0.deserialize_any(visitor)
} }
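
The deserializer edits show rustfmt's chain-breaking rule: once a call chain exceeds the configured width, every method call gets its own line, starting from the receiver (self.documents_fields becomes self / .documents_fields). A small self-contained illustration of the same layout:

// Sketch only: the same one-call-per-line chain layout rustfmt produced
// above, applied to an ordinary iterator pipeline.
fn shout_words(sentence: &str) -> Vec<String> {
    sentence
        .split_whitespace()
        .filter(|word| !word.is_empty())
        .map(|word| word.to_uppercase())
        .collect()
}

fn main() {
    assert_eq!(shout_words("hello meili"), vec!["HELLO", "MEILI"]);
}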


@ -5,13 +5,14 @@ use serde::{ser, Serialize};
use serde_json::Value; use serde_json::Value;
use siphasher::sip::SipHasher; use siphasher::sip::SipHasher;
use super::{SerializerError, ConvertToString}; use super::{ConvertToString, SerializerError};
pub fn extract_document_id<D>( pub fn extract_document_id<D>(
identifier: &str, identifier: &str,
document: &D, document: &D,
) -> Result<Option<DocumentId>, SerializerError> ) -> Result<Option<DocumentId>, SerializerError>
where D: serde::Serialize, where
D: serde::Serialize,
{ {
let serializer = ExtractDocumentId { identifier }; let serializer = ExtractDocumentId { identifier };
document.serialize(serializer) document.serialize(serializer)
@ -77,13 +78,18 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
} }
fn serialize_none(self) -> Result<Self::Ok, Self::Error> { fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" }) Err(SerializerError::UnserializableType {
type_name: "Option",
})
} }
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error> fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize, where
T: Serialize,
{ {
Err(SerializerError::UnserializableType { type_name: "Option" }) Err(SerializerError::UnserializableType {
type_name: "Option",
})
} }
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> { fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -91,25 +97,29 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
} }
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> { fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" }) Err(SerializerError::UnserializableType {
type_name: "unit struct",
})
} }
fn serialize_unit_variant( fn serialize_unit_variant(
self, self,
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str _variant: &'static str,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "unit variant" }) type_name: "unit variant",
})
} }
fn serialize_newtype_struct<T: ?Sized>( fn serialize_newtype_struct<T: ?Sized>(
self, self,
_name: &'static str, _name: &'static str,
value: &T value: &T,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: Serialize, where
T: Serialize,
{ {
value.serialize(self) value.serialize(self)
} }
@ -119,15 +129,20 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_value: &T _value: &T,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: Serialize, where
T: Serialize,
{ {
Err(SerializerError::UnserializableType { type_name: "newtype variant" }) Err(SerializerError::UnserializableType {
type_name: "newtype variant",
})
} }
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> { fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" }) Err(SerializerError::UnserializableType {
type_name: "sequence",
})
} }
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> { fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@ -137,10 +152,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
fn serialize_tuple_struct( fn serialize_tuple_struct(
self, self,
_name: &'static str, _name: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> ) -> Result<Self::SerializeTupleStruct, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "tuple struct" }) type_name: "tuple struct",
})
} }
fn serialize_tuple_variant( fn serialize_tuple_variant(
@ -148,10 +164,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> ) -> Result<Self::SerializeTupleVariant, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "tuple variant" }) type_name: "tuple variant",
})
} }
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -167,9 +184,8 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
fn serialize_struct( fn serialize_struct(
self, self,
_name: &'static str, _name: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeStruct, Self::Error> ) -> Result<Self::SerializeStruct, Self::Error> {
{
let serializer = ExtractDocumentIdStructSerializer { let serializer = ExtractDocumentIdStructSerializer {
identifier: self.identifier, identifier: self.identifier,
document_id: None, document_id: None,
@ -183,10 +199,11 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> ) -> Result<Self::SerializeStructVariant, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "struct variant" }) type_name: "struct variant",
})
} }
} }
@ -201,7 +218,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
type Error = SerializerError; type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error> fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize, where
T: Serialize,
{ {
let key = key.serialize(ConvertToString)?; let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key); self.current_key_name = Some(key);
@ -209,7 +227,8 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
} }
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error> fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize, where
T: Serialize,
{ {
let key = self.current_key_name.take().unwrap(); let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value) self.serialize_entry(&key, value)
@ -218,9 +237,11 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
fn serialize_entry<K: ?Sized, V: ?Sized>( fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self, &mut self,
key: &K, key: &K,
value: &V value: &V,
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where K: Serialize, V: Serialize, where
K: Serialize,
V: Serialize,
{ {
let key = key.serialize(ConvertToString)?; let key = key.serialize(ConvertToString)?;
@ -252,9 +273,10 @@ impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
fn serialize_field<T: ?Sized>( fn serialize_field<T: ?Sized>(
&mut self, &mut self,
key: &'static str, key: &'static str,
value: &T value: &T,
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where T: Serialize, where
T: Serialize,
{ {
if self.identifier == key { if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?; let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;


@ -2,9 +2,9 @@ use meilidb_schema::SchemaAttr;
use serde::ser; use serde::ser;
use serde::Serialize; use serde::Serialize;
use crate::DocumentId; use super::{ConvertToString, SerializerError};
use crate::raw_indexer::RawIndexer; use crate::raw_indexer::RawIndexer;
use super::{SerializerError, ConvertToString}; use crate::DocumentId;
pub struct Indexer<'a> { pub struct Indexer<'a> {
pub attribute: SchemaAttr, pub attribute: SchemaAttr,
@ -24,7 +24,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> { fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "boolean" }) Err(SerializerError::UnindexableType {
type_name: "boolean",
})
} }
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> { fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
@ -83,7 +85,9 @@ impl<'a> ser::Serializer for Indexer<'a> {
} }
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> { fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text); let number_of_words = self
.indexer
.index_text(self.document_id, self.attribute, text);
Ok(Some(number_of_words)) Ok(Some(number_of_words))
} }
@ -92,14 +96,19 @@ impl<'a> ser::Serializer for Indexer<'a> {
} }
fn serialize_none(self) -> Result<Self::Ok, Self::Error> { fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "Option" }) Err(SerializerError::UnindexableType {
type_name: "Option",
})
} }
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error> fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
let text = value.serialize(ConvertToString)?; let text = value.serialize(ConvertToString)?;
let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text); let number_of_words = self
.indexer
.index_text(self.document_id, self.attribute, &text);
Ok(Some(number_of_words)) Ok(Some(number_of_words))
} }
@ -108,25 +117,29 @@ impl<'a> ser::Serializer for Indexer<'a> {
} }
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> { fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "unit struct" }) Err(SerializerError::UnindexableType {
type_name: "unit struct",
})
} }
fn serialize_unit_variant( fn serialize_unit_variant(
self, self,
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str _variant: &'static str,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error> {
{ Err(SerializerError::UnindexableType {
Err(SerializerError::UnindexableType { type_name: "unit variant" }) type_name: "unit variant",
})
} }
fn serialize_newtype_struct<T: ?Sized>( fn serialize_newtype_struct<T: ?Sized>(
self, self,
_name: &'static str, _name: &'static str,
value: &T value: &T,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
value.serialize(self) value.serialize(self)
} }
@ -136,11 +149,14 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_value: &T _value: &T,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
Err(SerializerError::UnindexableType { type_name: "newtype variant" }) Err(SerializerError::UnindexableType {
type_name: "newtype variant",
})
} }
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> { fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
@ -168,10 +184,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_tuple_struct( fn serialize_tuple_struct(
self, self,
_name: &'static str, _name: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> ) -> Result<Self::SerializeTupleStruct, Self::Error> {
{ Err(SerializerError::UnindexableType {
Err(SerializerError::UnindexableType { type_name: "tuple struct" }) type_name: "tuple struct",
})
} }
fn serialize_tuple_variant( fn serialize_tuple_variant(
@ -179,10 +196,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> ) -> Result<Self::SerializeTupleVariant, Self::Error> {
{ Err(SerializerError::UnindexableType {
Err(SerializerError::UnindexableType { type_name: "tuple variant" }) type_name: "tuple variant",
})
} }
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -199,10 +217,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_struct( fn serialize_struct(
self, self,
_name: &'static str, _name: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeStruct, Self::Error> ) -> Result<Self::SerializeStruct, Self::Error> {
{ Err(SerializerError::UnindexableType {
Err(SerializerError::UnindexableType { type_name: "struct" }) type_name: "struct",
})
} }
fn serialize_struct_variant( fn serialize_struct_variant(
@ -210,10 +229,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> ) -> Result<Self::SerializeStructVariant, Self::Error> {
{ Err(SerializerError::UnindexableType {
Err(SerializerError::UnindexableType { type_name: "struct variant" }) type_name: "struct variant",
})
} }
} }
@ -229,7 +249,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Error = SerializerError; type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error> fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize where
T: ser::Serialize,
{ {
let text = value.serialize(ConvertToString)?; let text = value.serialize(ConvertToString)?;
self.texts.push(text); self.texts.push(text);
@ -238,7 +259,8 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str); let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts); self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None) Ok(None)
} }
} }
@ -255,7 +277,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Error = SerializerError; type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error> fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
let text = key.serialize(ConvertToString)?; let text = key.serialize(ConvertToString)?;
self.texts.push(text); self.texts.push(text);
@ -263,7 +286,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
} }
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error> fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
let text = value.serialize(ConvertToString)?; let text = value.serialize(ConvertToString)?;
self.texts.push(text); self.texts.push(text);
@ -272,7 +296,8 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str); let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts); self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None) Ok(None)
} }
} }
@ -293,7 +318,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
key: &'static str, key: &'static str,
value: &T, value: &T,
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
let key_text = key.to_owned(); let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?; let value_text = value.serialize(ConvertToString)?;
@ -304,7 +330,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str); let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts); self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None) Ok(None)
} }
} }
@ -321,7 +348,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Error = SerializerError; type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error> fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize where
T: Serialize,
{ {
let text = value.serialize(ConvertToString)?; let text = value.serialize(ConvertToString)?;
self.texts.push(text); self.texts.push(text);
@ -330,7 +358,8 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str); let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts); self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
Ok(None) Ok(None)
} }
} }
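
All the end() bodies in this file were rewrapped the same way because they share one shape: buffer each element as text during serialization, then flush the whole batch with a single index_text_seq call. A standalone sketch with a hypothetical, simplified indexer signature:

// RawIndexer here is a stand-in; the crate's real index_text_seq also takes
// a document id and a schema attribute.
struct RawIndexer {
    words: Vec<String>,
}

impl RawIndexer {
    fn index_text_seq<'a>(&mut self, texts: impl Iterator<Item = &'a str>) {
        self.words.extend(texts.map(str::to_owned));
    }
}

fn end(texts: Vec<String>, indexer: &mut RawIndexer) {
    // Borrow the buffered strings as &str, exactly like the diff's
    // self.texts.iter().map(String::as_str).
    let texts = texts.iter().map(String::as_str);
    indexer.index_text_seq(texts);
}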


@ -15,19 +15,19 @@ mod extract_document_id;
mod indexer; mod indexer;
mod serializer; mod serializer;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
pub use self::convert_to_string::ConvertToString;
pub use self::convert_to_number::ConvertToNumber; pub use self::convert_to_number::ConvertToNumber;
pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer; pub use self::indexer::Indexer;
pub use self::serializer::Serializer; pub use self::serializer::Serializer;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::{fmt, error::Error}; use std::{error::Error, fmt};
use meilidb_schema::SchemaAttr; use meilidb_schema::SchemaAttr;
use serde_json::Error as SerdeJsonError;
use serde::ser; use serde::ser;
use serde_json::Error as SerdeJsonError;
use crate::{DocumentId, ParseNumberError}; use crate::{DocumentId, ParseNumberError};
@ -55,24 +55,24 @@ impl fmt::Display for SerializerError {
match self { match self {
SerializerError::DocumentIdNotFound => { SerializerError::DocumentIdNotFound => {
f.write_str("serialized document does not have an id according to the schema") f.write_str("serialized document does not have an id according to the schema")
}, }
SerializerError::InvalidDocumentIdType => { SerializerError::InvalidDocumentIdType => {
f.write_str("document identifier can only be of type string or number") f.write_str("document identifier can only be of type string or number")
}, }
SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e), SerializerError::Zlmdb(e) => write!(f, "zlmdb related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e), SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => { SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e) write!(f, "error while trying to parse a number: {}", e)
}, }
SerializerError::UnserializableType { type_name } => { SerializerError::UnserializableType { type_name } => {
write!(f, "{} is not a serializable type", type_name) write!(f, "{} is not a serializable type", type_name)
}, }
SerializerError::UnindexableType { type_name } => { SerializerError::UnindexableType { type_name } => {
write!(f, "{} is not an indexable type", type_name) write!(f, "{} is not an indexable type", type_name)
}, }
SerializerError::UnrankableType { type_name } => { SerializerError::UnrankableType { type_name } => {
write!(f, "{} types can not be used for ranking", type_name) write!(f, "{} types can not be used for ranking", type_name)
}, }
SerializerError::Custom(s) => f.write_str(s), SerializerError::Custom(s) => f.write_str(s),
} }
} }
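
Every arm comma dropped in this Display impl is the same rustfmt rule: a match arm whose body is a block takes no trailing comma, while expression arms keep theirs. Both spellings compile; a short example of the canonical form:

// Block-bodied arms end with }, expression arms keep their comma.
fn describe(n: i32) -> String {
    match n {
        0 => "zero".to_string(),
        n if n < 0 => {
            format!("negative {}", -n)
        }
        _ => format!("positive {}", n),
    }
}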
@ -119,3 +119,9 @@ impl RamDocumentStore {
self.0 self.0
} }
} }
impl Default for RamDocumentStore {
fn default() -> Self {
Self::new()
}
}
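
The added Default impl is a clippy fix rather than a formatting one: the new_without_default lint asks that a public type constructible with a no-argument new() also implement Default. A sketch with a simplified key type (the real store is keyed by (DocumentId, SchemaAttr)):

use std::collections::BTreeMap;

pub struct RamDocumentStore(BTreeMap<Vec<u8>, Vec<u8>>);

impl RamDocumentStore {
    pub fn new() -> RamDocumentStore {
        RamDocumentStore(BTreeMap::new())
    }
}

// Forward Default to new(), which is all clippy's new_without_default asks.
impl Default for RamDocumentStore {
    fn default() -> Self {
        Self::new()
    }
}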


@ -1,12 +1,12 @@
use std::collections::HashMap;
use meilidb_schema::{Schema, SchemaAttr}; use meilidb_schema::{Schema, SchemaAttr};
use serde::ser; use serde::ser;
use std::collections::HashMap;
use crate::{DocumentId, RankedMap};
use crate::raw_indexer::RawIndexer; use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore; use crate::serde::RamDocumentStore;
use crate::{DocumentId, RankedMap};
use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
pub struct Serializer<'a> { pub struct Serializer<'a> {
pub schema: &'a Schema, pub schema: &'a Schema,
@ -55,13 +55,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
} }
fn serialize_none(self) -> Result<Self::Ok, Self::Error> { fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" }) Err(SerializerError::UnserializableType {
type_name: "Option",
})
} }
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error> fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
Err(SerializerError::UnserializableType { type_name: "Option" }) Err(SerializerError::UnserializableType {
type_name: "Option",
})
} }
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> { fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -69,25 +74,29 @@ impl<'a> ser::Serializer for Serializer<'a> {
} }
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> { fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" }) Err(SerializerError::UnserializableType {
type_name: "unit struct",
})
} }
fn serialize_unit_variant( fn serialize_unit_variant(
self, self,
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str _variant: &'static str,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "unit variant" }) type_name: "unit variant",
})
} }
fn serialize_newtype_struct<T: ?Sized>( fn serialize_newtype_struct<T: ?Sized>(
self, self,
_name: &'static str, _name: &'static str,
value: &T value: &T,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
value.serialize(self) value.serialize(self)
} }
@ -97,15 +106,20 @@ impl<'a> ser::Serializer for Serializer<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_value: &T _value: &T,
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
Err(SerializerError::UnserializableType { type_name: "newtype variant" }) Err(SerializerError::UnserializableType {
type_name: "newtype variant",
})
} }
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> { fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" }) Err(SerializerError::UnserializableType {
type_name: "sequence",
})
} }
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> { fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
@ -115,10 +129,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_tuple_struct( fn serialize_tuple_struct(
self, self,
_name: &'static str, _name: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> ) -> Result<Self::SerializeTupleStruct, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "tuple struct" }) type_name: "tuple struct",
})
} }
fn serialize_tuple_variant( fn serialize_tuple_variant(
@ -126,10 +141,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> ) -> Result<Self::SerializeTupleVariant, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "tuple variant" }) type_name: "tuple variant",
})
} }
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
@ -147,9 +163,8 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_struct( fn serialize_struct(
self, self,
_name: &'static str, _name: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeStruct, Self::Error> ) -> Result<Self::SerializeStruct, Self::Error> {
{
Ok(StructSerializer { Ok(StructSerializer {
schema: self.schema, schema: self.schema,
document_id: self.document_id, document_id: self.document_id,
@ -165,10 +180,11 @@ impl<'a> ser::Serializer for Serializer<'a> {
_name: &'static str, _name: &'static str,
_variant_index: u32, _variant_index: u32,
_variant: &'static str, _variant: &'static str,
_len: usize _len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> ) -> Result<Self::SerializeStructVariant, Self::Error> {
{ Err(SerializerError::UnserializableType {
Err(SerializerError::UnserializableType { type_name: "struct variant" }) type_name: "struct variant",
})
} }
} }
@ -187,7 +203,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
type Error = SerializerError; type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error> fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
let key = key.serialize(ConvertToString)?; let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key); self.current_key_name = Some(key);
@ -195,7 +212,8 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
} }
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error> fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
let key = self.current_key_name.take().unwrap(); let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value) self.serialize_entry(&key, value)
@ -206,7 +224,9 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
key: &K, key: &K,
value: &V, value: &V,
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where K: ser::Serialize, V: ser::Serialize, where
K: ser::Serialize,
V: ser::Serialize,
{ {
let key = key.serialize(ConvertToString)?; let key = key.serialize(ConvertToString)?;
@ -245,7 +265,8 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
key: &'static str, key: &'static str,
value: &T, value: &T,
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
serialize_value( serialize_value(
self.schema, self.schema,
@ -274,7 +295,8 @@ fn serialize_value<T: ?Sized>(
key: &str, key: &str,
value: &T, value: &T,
) -> Result<(), SerializerError> ) -> Result<(), SerializerError>
where T: ser::Serialize, where
T: ser::Serialize,
{ {
if let Some(attribute) = schema.attribute(key) { if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute); let props = schema.props(attribute);
@ -283,7 +305,11 @@ where T: ser::Serialize,
document_store.set_document_field(document_id, attribute, serialized); document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() { if props.is_indexed() {
let indexer = Indexer { attribute, indexer, document_id }; let indexer = Indexer {
attribute,
indexer,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? { if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.insert((document_id, attribute), number_of_words as u64); documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
} }

View File

@ -1,8 +1,8 @@
use std::sync::Arc;
use zlmdb::types::{OwnedType, ByteSlice};
use zlmdb::Result as ZResult;
use crate::DocumentId;
use super::BEU64; use super::BEU64;
use crate::DocumentId;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocsWords { pub struct DocsWords {
@ -11,33 +11,30 @@ pub struct DocsWords {
impl DocsWords { impl DocsWords {
pub fn put_doc_words( pub fn put_doc_words(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
document_id: DocumentId, document_id: DocumentId,
words: &fst::Set, words: &fst::Set,
) -> ZResult<()> ) -> ZResult<()> {
{
let document_id = BEU64::new(document_id.0); let document_id = BEU64::new(document_id.0);
let bytes = words.as_fst().as_bytes(); let bytes = words.as_fst().as_bytes();
self.docs_words.put(writer, &document_id, bytes) self.docs_words.put(writer, &document_id, bytes)
} }
pub fn del_doc_words( pub fn del_doc_words(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<bool> ) -> ZResult<bool> {
{
let document_id = BEU64::new(document_id.0); let document_id = BEU64::new(document_id.0);
self.docs_words.delete(writer, &document_id) self.docs_words.delete(writer, &document_id)
} }
pub fn doc_words( pub fn doc_words(
&self, self,
reader: &zlmdb::RoTxn, reader: &zlmdb::RoTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<Option<fst::Set>> ) -> ZResult<Option<fst::Set>> {
{
let document_id = BEU64::new(document_id.0); let document_id = BEU64::new(document_id.0);
match self.docs_words.get(reader, &document_id)? { match self.docs_words.get(reader, &document_id)? {
Some(bytes) => { Some(bytes) => {
@ -45,7 +42,7 @@ impl DocsWords {
let bytes = Arc::from(bytes); let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst))) Ok(Some(fst::Set::from(fst)))
}, }
None => Ok(None), None => Ok(None),
} }
} }
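
From here on, every store method changes &self to self. These store handles are #[derive(Copy, Clone)] wrappers around an LMDB database handle, so clippy's trivially_copy_pass_by_ref lint suggests passing them by value: copying the handle is as cheap as copying the reference. Minimal illustration:

// A small Copy type: taking &self here would trip
// clippy::trivially_copy_pass_by_ref, so take self by value instead.
#[derive(Copy, Clone)]
struct Meters(f64);

impl Meters {
    fn to_feet(self) -> f64 {
        self.0 * 3.280_84
    }
}

fn main() {
    assert!((Meters(1.0).to_feet() - 3.280_84).abs() < 1e-9);
}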


@ -1,9 +1,9 @@
use meilidb_schema::SchemaAttr; use meilidb_schema::SchemaAttr;
use zlmdb::types::{OwnedType, ByteSlice}; use zlmdb::types::{ByteSlice, OwnedType};
use zlmdb::Result as ZResult; use zlmdb::Result as ZResult;
use crate::DocumentId;
use super::DocumentAttrKey; use super::DocumentAttrKey;
use crate::DocumentId;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocumentsFields { pub struct DocumentsFields {
@ -12,45 +12,41 @@ pub struct DocumentsFields {
impl DocumentsFields { impl DocumentsFields {
pub fn put_document_field( pub fn put_document_field(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
value: &[u8], value: &[u8],
) -> ZResult<()> ) -> ZResult<()> {
{
let key = DocumentAttrKey::new(document_id, attribute); let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields.put(writer, &key, value) self.documents_fields.put(writer, &key, value)
} }
pub fn del_all_document_fields( pub fn del_all_document_fields(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<usize> ) -> ZResult<usize> {
{
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields.delete_range(writer, start..=end) self.documents_fields.delete_range(writer, start..=end)
} }
pub fn document_attribute<'txn>( pub fn document_attribute<'txn>(
&self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn zlmdb::RoTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> ZResult<Option<&'txn [u8]>> ) -> ZResult<Option<&'txn [u8]>> {
{
let key = DocumentAttrKey::new(document_id, attribute); let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields.get(reader, &key) self.documents_fields.get(reader, &key)
} }
pub fn document_fields<'txn>( pub fn document_fields<'txn>(
&self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn zlmdb::RoTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> ) -> ZResult<DocumentFieldsIter<'txn>> {
{
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields.range(reader, start..=end)?; let iter = self.documents_fields.range(reader, start..=end)?;
@ -70,8 +66,8 @@ impl<'txn> Iterator for DocumentFieldsIter<'txn> {
Some(Ok((key, bytes))) => { Some(Ok((key, bytes))) => {
let attr = SchemaAttr(key.attr.get()); let attr = SchemaAttr(key.attr.get());
Some(Ok((attr, bytes))) Some(Ok((attr, bytes)))
}, }
Some(Err(e)) => Some(Err(e.into())), Some(Err(e)) => Some(Err(e)),
None => None, None => None,
} }
} }
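
The Some(Err(e.into())) to Some(Err(e)) edits are another clippy pass, presumably the identity_conversion lint of that era (later renamed useless_conversion): calling .into() when source and target types are identical is a no-op. Sketch of the corrected shape:

// The error type already matches the return type, so no conversion is
// needed when forwarding it.
fn forward(result: Result<u32, String>) -> Result<u32, String> {
    match result {
        Ok(value) => Ok(value),
        Err(e) => Err(e), // previously Err(e.into()), a no-op conversion
    }
}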


@ -1,8 +1,8 @@
use super::DocumentAttrKey;
use crate::DocumentId;
use meilidb_schema::SchemaAttr; use meilidb_schema::SchemaAttr;
use zlmdb::types::OwnedType; use zlmdb::types::OwnedType;
use zlmdb::Result as ZResult; use zlmdb::Result as ZResult;
use crate::DocumentId;
use super::DocumentAttrKey;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts { pub struct DocumentsFieldsCounts {
@ -11,35 +11,33 @@ pub struct DocumentsFieldsCounts {
impl DocumentsFieldsCounts { impl DocumentsFieldsCounts {
pub fn put_document_field_count( pub fn put_document_field_count(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
value: u64, value: u64,
) -> ZResult<()> ) -> ZResult<()> {
{
let key = DocumentAttrKey::new(document_id, attribute); let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value) self.documents_fields_counts.put(writer, &key, &value)
} }
pub fn del_all_document_fields_counts( pub fn del_all_document_fields_counts(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<usize> ) -> ZResult<usize> {
{
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields_counts.delete_range(writer, start..=end) self.documents_fields_counts
.delete_range(writer, start..=end)
} }
pub fn document_field_count( pub fn document_field_count(
&self, self,
reader: &zlmdb::RoTxn, reader: &zlmdb::RoTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> ZResult<Option<u64>> ) -> ZResult<Option<u64>> {
{
let key = DocumentAttrKey::new(document_id, attribute); let key = DocumentAttrKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? { match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)), Some(count) => Ok(Some(count)),
@ -48,11 +46,10 @@ impl DocumentsFieldsCounts {
} }
pub fn document_fields_counts<'txn>( pub fn document_fields_counts<'txn>(
&self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn zlmdb::RoTxn,
document_id: DocumentId, document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>> ) -> ZResult<DocumentFieldsCountsIter<'txn>> {
{
let start = DocumentAttrKey::new(document_id, SchemaAttr::min()); let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max()); let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields_counts.range(reader, start..=end)?; let iter = self.documents_fields_counts.range(reader, start..=end)?;
@ -60,19 +57,20 @@ impl DocumentsFieldsCounts {
} }
pub fn documents_ids<'txn>( pub fn documents_ids<'txn>(
&self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn zlmdb::RoTxn,
) -> ZResult<DocumentsIdsIter<'txn>> ) -> ZResult<DocumentsIdsIter<'txn>> {
{
let iter = self.documents_fields_counts.iter(reader)?; let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter { last_seen_id: None, iter }) Ok(DocumentsIdsIter {
last_seen_id: None,
iter,
})
} }
pub fn all_documents_fields_counts<'txn>( pub fn all_documents_fields_counts<'txn>(
&self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn zlmdb::RoTxn,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> ) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
{
let iter = self.documents_fields_counts.iter(reader)?; let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter }) Ok(AllDocumentsFieldsCountsIter { iter })
} }
@ -90,8 +88,8 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
Some(Ok((key, count))) => { Some(Ok((key, count))) => {
let attr = SchemaAttr(key.attr.get()); let attr = SchemaAttr(key.attr.get());
Some(Ok((attr, count))) Some(Ok((attr, count)))
}, }
Some(Err(e)) => Some(Err(e.into())), Some(Err(e)) => Some(Err(e)),
None => None, None => None,
} }
} }
@ -112,10 +110,10 @@ impl Iterator for DocumentsIdsIter<'_> {
let document_id = DocumentId(key.docid.get()); let document_id = DocumentId(key.docid.get());
if Some(document_id) != self.last_seen_id { if Some(document_id) != self.last_seen_id {
self.last_seen_id = Some(document_id); self.last_seen_id = Some(document_id);
return Some(Ok(document_id)) return Some(Ok(document_id));
} }
}, }
Err(e) => return Some(Err(e.into())), Err(e) => return Some(Err(e)),
} }
} }
None None
@ -135,8 +133,8 @@ impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
let docid = DocumentId(key.docid.get()); let docid = DocumentId(key.docid.get());
let attr = SchemaAttr(key.attr.get()); let attr = SchemaAttr(key.attr.get());
Some(Ok((docid, attr, count))) Some(Ok((docid, attr, count)))
}, }
Some(Err(e)) => Some(Err(e.into())), Some(Err(e)) => Some(Err(e)),
None => None, None => None,
} }
} }
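
Beyond the mechanical rewrites, DocumentsIdsIter is worth a note: the underlying keys are sorted by (document id, attribute), so remembering only the last id seen is enough to yield each document id once. The same idea over a plain sorted iterator:

// Deduplicate consecutive ids from an already-sorted iterator, mirroring
// DocumentsIdsIter's last_seen_id logic.
fn unique_ids(sorted: impl Iterator<Item = u64>) -> impl Iterator<Item = u64> {
    let mut last_seen = None;
    sorted.filter(move |&id| {
        if last_seen == Some(id) {
            false
        } else {
            last_seen = Some(id);
            true
        }
    })
}

fn main() {
    let ids = unique_ids(vec![1, 1, 2, 2, 2, 3].into_iter());
    assert_eq!(ids.collect::<Vec<_>>(), vec![1, 2, 3]);
}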


@ -1,15 +1,15 @@
use std::sync::Arc;
use meilidb_schema::Schema;
use zlmdb::types::{Str, OwnedType, ByteSlice, Serde};
use zlmdb::Result as ZResult;
use crate::RankedMap; use crate::RankedMap;
use meilidb_schema::Schema;
use std::sync::Arc;
use zlmdb::types::{ByteSlice, OwnedType, Serde, Str};
use zlmdb::Result as ZResult;
const CUSTOMS_KEY: &str = "customs-key"; const CUSTOMS_KEY: &str = "customs-key";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map"; const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema"; const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms"; const SYNONYMS_KEY: &str = "synonyms";
const WORDS_KEY: &str = "words"; const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct Main { pub struct Main {
@ -17,76 +17,85 @@ pub struct Main {
} }
impl Main { impl Main {
pub fn put_words_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { pub fn put_words_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes(); let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes) self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes)
} }
pub fn words_fst(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> { pub fn words_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? { match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => { Some(bytes) => {
let len = bytes.len(); let len = bytes.len();
let bytes = Arc::from(bytes); let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst))) Ok(Some(fst::Set::from(fst)))
}, }
None => Ok(None), None => Ok(None),
} }
} }
pub fn put_schema(&self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> { pub fn put_schema(self, writer: &mut zlmdb::RwTxn, schema: &Schema) -> ZResult<()> {
self.main.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema) self.main
.put::<Str, Serde<Schema>>(writer, SCHEMA_KEY, schema)
} }
pub fn schema(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> { pub fn schema(self, reader: &zlmdb::RoTxn) -> ZResult<Option<Schema>> {
self.main.get::<Str, Serde<Schema>>(reader, SCHEMA_KEY) self.main.get::<Str, Serde<Schema>>(reader, SCHEMA_KEY)
} }
pub fn put_ranked_map(&self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> { pub fn put_ranked_map(self, writer: &mut zlmdb::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map) self.main
.put::<Str, Serde<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
} }
pub fn ranked_map(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> { pub fn ranked_map(self, reader: &zlmdb::RoTxn) -> ZResult<Option<RankedMap>> {
self.main.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY) self.main
.get::<Str, Serde<RankedMap>>(reader, RANKED_MAP_KEY)
} }
pub fn put_synonyms_fst(&self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> { pub fn put_synonyms_fst(self, writer: &mut zlmdb::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes(); let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes) self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
} }
pub fn synonyms_fst(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> { pub fn synonyms_fst(self, reader: &zlmdb::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? { match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => { Some(bytes) => {
let len = bytes.len(); let len = bytes.len();
let bytes = Arc::from(bytes); let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst))) Ok(Some(fst::Set::from(fst)))
}, }
None => Ok(None), None => Ok(None),
} }
} }
pub fn put_number_of_documents<F>(&self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64> pub fn put_number_of_documents<F>(self, writer: &mut zlmdb::RwTxn, f: F) -> ZResult<u64>
where F: Fn(u64) -> u64, where
F: Fn(u64) -> u64,
{ {
let new = self.number_of_documents(writer).map(f)?; let new = self.number_of_documents(writer).map(f)?;
self.main.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?; self.main
.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
Ok(new) Ok(new)
} }
pub fn number_of_documents(&self, reader: &zlmdb::RoTxn) -> ZResult<u64> { pub fn number_of_documents(self, reader: &zlmdb::RoTxn) -> ZResult<u64> {
match self.main.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)? { match self
.main
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
{
Some(value) => Ok(value), Some(value) => Ok(value),
None => Ok(0), None => Ok(0),
} }
} }
pub fn put_customs(&self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> { pub fn put_customs(self, writer: &mut zlmdb::RwTxn, customs: &[u8]) -> ZResult<()> {
self.main.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs) self.main
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
} }
pub fn customs<'txn>(&self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> { pub fn customs<'txn>(self, reader: &'txn zlmdb::RoTxn) -> ZResult<Option<&'txn [u8]>> {
self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY) self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
} }
} }
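
put_number_of_documents is the one read-modify-write in this file: read the current counter, apply the caller's closure, and write the result back within the same write transaction. The pattern, sketched over a plain map instead of the zlmdb typed store:

use std::collections::HashMap;

// Hypothetical stand-in for the zlmdb-backed counter update.
fn put_number_of_documents<F>(main: &mut HashMap<&'static str, u64>, f: F) -> u64
where
    F: Fn(u64) -> u64,
{
    let current = main.get("number-of-documents").copied().unwrap_or(0);
    let new = f(current);
    main.insert("number-of-documents", new);
    new
}

fn main() {
    let mut store = HashMap::new();
    assert_eq!(put_number_of_documents(&mut store, |n| n + 5), 5);
    assert_eq!(put_number_of_documents(&mut store, |n| n + 1), 6);
}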


@ -8,8 +8,10 @@ mod updates;
mod updates_results; mod updates_results;
pub use self::docs_words::DocsWords; pub use self::docs_words::DocsWords;
pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter}; pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter}; pub use self::documents_fields_counts::{
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
};
pub use self::main::Main; pub use self::main::Main;
pub use self::postings_lists::PostingsLists; pub use self::postings_lists::PostingsLists;
pub use self::synonyms::Synonyms; pub use self::synonyms::Synonyms;
@ -25,19 +27,24 @@ use zlmdb::Result as ZResult;
use crate::criterion::Criteria; use crate::criterion::Criteria;
use crate::serde::Deserializer; use crate::serde::Deserializer;
use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error}; use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
type BEU64 = zerocopy::U64<byteorder::BigEndian>; type BEU64 = zerocopy::U64<byteorder::BigEndian>;
type BEU16 = zerocopy::U16<byteorder::BigEndian>; type BEU16 = zerocopy::U16<byteorder::BigEndian>;
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[derive(AsBytes, FromBytes)]
#[repr(C)] #[repr(C)]
pub struct DocumentAttrKey { docid: BEU64, attr: BEU16 } pub struct DocumentAttrKey {
docid: BEU64,
attr: BEU16,
}
impl DocumentAttrKey { impl DocumentAttrKey {
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey { fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
DocumentAttrKey { docid: BEU64::new(docid.0), attr: BEU16::new(attr.0) } DocumentAttrKey {
docid: BEU64::new(docid.0),
attr: BEU16::new(attr.0),
}
} }
} }
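
DocumentAttrKey packs its fields as big-endian integers (zerocopy::U64 and U16 with BigEndian) so that LMDB's byte-wise key order matches numeric order, which is what makes the start..=end range scans in the stores above work. A sketch of the encoding:

// Big-endian (docid, attr) key: byte-wise comparison of these arrays orders
// keys exactly like comparing the (docid, attr) tuples numerically.
fn document_attr_key(docid: u64, attr: u16) -> [u8; 10] {
    let mut key = [0u8; 10];
    key[..8].copy_from_slice(&docid.to_be_bytes());
    key[8..].copy_from_slice(&attr.to_be_bytes());
    key
}

fn main() {
    assert!(document_attr_key(1, 9) < document_attr_key(2, 0));
}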
@ -93,13 +100,15 @@ impl Index {
reader: &zlmdb::RoTxn, reader: &zlmdb::RoTxn,
attributes: Option<&HashSet<&str>>, attributes: Option<&HashSet<&str>>,
document_id: DocumentId, document_id: DocumentId,
) -> MResult<Option<T>> ) -> MResult<Option<T>> {
{
let schema = self.main.schema(reader)?; let schema = self.main.schema(reader)?;
let schema = schema.ok_or(Error::SchemaMissing)?; let schema = schema.ok_or(Error::SchemaMissing)?;
let attributes = match attributes { let attributes = match attributes {
Some(attributes) => attributes.into_iter().map(|name| schema.attribute(name)).collect(), Some(attributes) => attributes
.iter()
.map(|name| schema.attribute(name))
.collect(),
None => None, None => None,
}; };
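
The attributes.into_iter() to attributes.iter() change looks cosmetic but is likely a clippy fix (the into_iter_on_ref lint): calling into_iter on a borrowed HashSet iterates references anyway, so iter() states what actually happens. For example:

use std::collections::HashSet;

// On a borrowed collection, iter() and into_iter() yield the same iterator
// of references; clippy prefers the explicit spelling.
fn contains_empty(names: &HashSet<&str>) -> bool {
    names.iter().any(|name| name.is_empty())
}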
@ -121,9 +130,10 @@ impl Index {
reader: &zlmdb::RoTxn, reader: &zlmdb::RoTxn,
document_id: DocumentId, document_id: DocumentId,
attribute: SchemaAttr, attribute: SchemaAttr,
) -> MResult<Option<T>> ) -> MResult<Option<T>> {
{ let bytes = self
let bytes = self.documents_fields.document_attribute(reader, document_id, attribute)?; .documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes { match bytes {
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)), Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
None => Ok(None), None => Ok(None),
@ -183,14 +193,8 @@ impl Index {
&self, &self,
reader: &zlmdb::RoTxn, reader: &zlmdb::RoTxn,
update_id: u64, update_id: u64,
) -> MResult<update::UpdateStatus> ) -> MResult<update::UpdateStatus> {
{ update::update_status(reader, self.updates, self.updates_results, update_id)
update::update_status(
reader,
self.updates,
self.updates_results,
update_id,
)
} }
pub fn query_builder(&self) -> QueryBuilder { pub fn query_builder(&self) -> QueryBuilder {
@ -205,8 +209,7 @@ impl Index {
pub fn query_builder_with_criteria<'c, 'f, 'd>( pub fn query_builder_with_criteria<'c, 'f, 'd>(
&self, &self,
criteria: Criteria<'c>, criteria: Criteria<'c>,
) -> QueryBuilder<'c, 'f, 'd> ) -> QueryBuilder<'c, 'f, 'd> {
{
QueryBuilder::with_criteria( QueryBuilder::with_criteria(
self.main, self.main,
self.postings_lists, self.postings_lists,
@ -221,8 +224,7 @@ pub fn create(
env: &zlmdb::Env, env: &zlmdb::Env,
name: &str, name: &str,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Index> ) -> MResult<Index> {
{
// create all the store names // create all the store names
let main_name = main_name(name); let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name); let postings_lists_name = postings_lists_name(name);
@ -247,7 +249,9 @@ pub fn create(
main: Main { main }, main: Main { main },
postings_lists: PostingsLists { postings_lists }, postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields }, documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts }, documents_fields_counts: DocumentsFieldsCounts {
documents_fields_counts,
},
synonyms: Synonyms { synonyms }, synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words }, docs_words: DocsWords { docs_words },
updates: Updates { updates }, updates: Updates { updates },
@ -260,8 +264,7 @@ pub fn open(
env: &zlmdb::Env, env: &zlmdb::Env,
name: &str, name: &str,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Option<Index>> ) -> MResult<Option<Index>> {
{
// create all the store names // create all the store names
let main_name = main_name(name); let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name); let postings_lists_name = postings_lists_name(name);
@ -310,7 +313,9 @@ pub fn open(
main: Main { main }, main: Main { main },
postings_lists: PostingsLists { postings_lists }, postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields }, documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts }, documents_fields_counts: DocumentsFieldsCounts {
documents_fields_counts,
},
synonyms: Synonyms { synonyms }, synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words }, docs_words: DocsWords { docs_words },
updates: Updates { updates }, updates: Updates { updates },


@ -1,8 +1,8 @@
use std::borrow::Cow; use crate::DocIndex;
use sdset::{Set, SetBuf}; use sdset::{Set, SetBuf};
use std::borrow::Cow;
use zlmdb::types::{ByteSlice, CowSlice}; use zlmdb::types::{ByteSlice, CowSlice};
use zlmdb::Result as ZResult; use zlmdb::Result as ZResult;
use crate::DocIndex;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct PostingsLists { pub struct PostingsLists {
@ -11,25 +11,23 @@ pub struct PostingsLists {
impl PostingsLists { impl PostingsLists {
pub fn put_postings_list( pub fn put_postings_list(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
word: &[u8], word: &[u8],
words_indexes: &Set<DocIndex>, words_indexes: &Set<DocIndex>,
) -> ZResult<()> ) -> ZResult<()> {
{
self.postings_lists.put(writer, word, words_indexes) self.postings_lists.put(writer, word, words_indexes)
} }
pub fn del_postings_list(&self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> { pub fn del_postings_list(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word) self.postings_lists.delete(writer, word)
} }
pub fn postings_list<'txn>( pub fn postings_list<'txn>(
&self, self,
reader: &'txn zlmdb::RoTxn, reader: &'txn zlmdb::RoTxn,
word: &[u8], word: &[u8],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> ) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
{
match self.postings_lists.get(reader, word)? { match self.postings_lists.get(reader, word)? {
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))), Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))), Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
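
postings_list preserves whatever ownership the storage layer hands back: a borrowed byte slice is rewrapped as Cow::Borrowed, an owned fallback buffer as Cow::Owned, so callers only pay for an allocation when one already happened. The general shape, with a hypothetical lookup:

use std::borrow::Cow;

// Return borrowed data when it is already resident, owned data otherwise;
// the caller treats both uniformly through Cow.
fn lookup<'a>(resident: &'a [u8], fallback: Option<Vec<u8>>) -> Cow<'a, [u8]> {
    match fallback {
        Some(bytes) => Cow::Owned(bytes),
        None => Cow::Borrowed(resident),
    }
}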


@ -9,28 +9,27 @@ pub struct Synonyms {
impl Synonyms { impl Synonyms {
pub fn put_synonyms( pub fn put_synonyms(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
word: &[u8], word: &[u8],
synonyms: &fst::Set, synonyms: &fst::Set,
) -> ZResult<()> ) -> ZResult<()> {
{
let bytes = synonyms.as_fst().as_bytes(); let bytes = synonyms.as_fst().as_bytes();
self.synonyms.put(writer, word, bytes) self.synonyms.put(writer, word, bytes)
} }
pub fn del_synonyms(&self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> { pub fn del_synonyms(self, writer: &mut zlmdb::RwTxn, word: &[u8]) -> ZResult<bool> {
self.synonyms.delete(writer, word) self.synonyms.delete(writer, word)
} }
pub fn synonyms(&self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> { pub fn synonyms(self, reader: &zlmdb::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
match self.synonyms.get(reader, word)? { match self.synonyms.get(reader, word)? {
Some(bytes) => { Some(bytes) => {
let len = bytes.len(); let len = bytes.len();
let bytes = Arc::from(bytes); let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst))) Ok(Some(fst::Set::from(fst)))
}, }
None => Ok(None), None => Ok(None),
} }
} }


@ -1,13 +1,16 @@
use super::BEU64;
use crate::update::Update;
use serde::{Deserialize, Serialize};
use std::borrow::Cow; use std::borrow::Cow;
use zlmdb::types::OwnedType; use zlmdb::types::OwnedType;
use zlmdb::{Result as ZResult, BytesEncode, BytesDecode}; use zlmdb::{BytesDecode, BytesEncode, Result as ZResult};
use serde::{Serialize, Deserialize};
use crate::update::Update;
use super::BEU64;
pub struct SerdeJson<T>(std::marker::PhantomData<T>); pub struct SerdeJson<T>(std::marker::PhantomData<T>);
impl<T> BytesEncode for SerdeJson<T> where T: Serialize { impl<T> BytesEncode for SerdeJson<T>
where
T: Serialize,
{
type EItem = T; type EItem = T;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
@ -15,7 +18,10 @@ impl<T> BytesEncode for SerdeJson<T> where T: Serialize {
} }
} }
impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T> where T: Deserialize<'a> + Clone { impl<'a, T: 'a> BytesDecode<'a> for SerdeJson<T>
where
T: Deserialize<'a> + Clone,
{
type DItem = T; type DItem = T;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
@ -30,7 +36,7 @@ pub struct Updates {
impl Updates { impl Updates {
// TODO do not trigger deserialize if possible // TODO do not trigger deserialize if possible
pub fn last_update_id(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> { pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.last(reader)? { match self.updates.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))), Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None), None => Ok(None),
@ -38,7 +44,7 @@ impl Updates {
} }
// TODO do not trigger deserialize if possible // TODO do not trigger deserialize if possible
fn first_update_id(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> { fn first_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, Update)>> {
match self.updates.first(reader)? { match self.updates.first(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))), Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None), None => Ok(None),
@ -46,31 +52,30 @@ impl Updates {
} }
// TODO do not trigger deserialize if possible // TODO do not trigger deserialize if possible
pub fn contains(&self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> { pub fn contains(self, reader: &zlmdb::RoTxn, update_id: u64) -> ZResult<bool> {
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates.get(reader, &update_id).map(|v| v.is_some()) self.updates.get(reader, &update_id).map(|v| v.is_some())
} }
pub fn put_update( pub fn put_update(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
update_id: u64, update_id: u64,
update: &Update, update: &Update,
) -> ZResult<()> ) -> ZResult<()> {
{
// TODO prefer using serde_json? // TODO prefer using serde_json?
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates.put(writer, &update_id, update) self.updates.put(writer, &update_id, update)
} }
pub fn pop_front(&self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> { pub fn pop_front(self, writer: &mut zlmdb::RwTxn) -> ZResult<Option<(u64, Update)>> {
match self.first_update_id(writer)? { match self.first_update_id(writer)? {
Some((update_id, update)) => { Some((update_id, update)) => {
let key = BEU64::new(update_id); let key = BEU64::new(update_id);
self.updates.delete(writer, &key)?; self.updates.delete(writer, &key)?;
Ok(Some((update_id, update))) Ok(Some((update_id, update)))
}, }
None => Ok(None) None => Ok(None),
} }
} }
} }
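`Updates` keys its entries with `BEU64`, which by its name is assumed to be a big-endian `u64`, so the byte order LMDB sorts by agrees with numeric order and `first`/`last` really return the oldest and newest updates. A sketch of why the encoding matters, with a `BTreeMap` standing in for the database:

use std::collections::BTreeMap;

fn main() {
    // Keys are ordered as raw bytes, the way an LMDB database orders them.
    let mut tree: BTreeMap<[u8; 8], &str> = BTreeMap::new();
    let items = [(256u64, "bigger"), (1, "first"), (2, "second")];
    for &(id, name) in items.iter() {
        // Big-endian bytes sort like the numbers themselves;
        // little-endian keys would sort 256 (low byte 0) before 2.
        tree.insert(id.to_be_bytes(), name);
    }

    // `first_update_id` / `pop_front` rely on this: the smallest key
    // belongs to the oldest update.
    let (key, name) = tree.iter().next().unwrap();
    assert_eq!(u64::from_be_bytes(*key), 1);
    assert_eq!(*name, "first");
}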
View File
@ -1,7 +1,7 @@
use super::BEU64;
use crate::update::UpdateResult;
use zlmdb::types::{OwnedType, Serde}; use zlmdb::types::{OwnedType, Serde};
use zlmdb::Result as ZResult; use zlmdb::Result as ZResult;
use crate::update::UpdateResult;
use super::BEU64;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct UpdatesResults { pub struct UpdatesResults {
@ -9,7 +9,7 @@ pub struct UpdatesResults {
} }
impl UpdatesResults { impl UpdatesResults {
pub fn last_update_id(&self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> { pub fn last_update_id(self, reader: &zlmdb::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> {
match self.updates_results.last(reader)? { match self.updates_results.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))), Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None), None => Ok(None),
@ -17,22 +17,20 @@ impl UpdatesResults {
} }
pub fn put_update_result( pub fn put_update_result(
&self, self,
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
update_id: u64, update_id: u64,
update_result: &UpdateResult, update_result: &UpdateResult,
) -> ZResult<()> ) -> ZResult<()> {
{
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result) self.updates_results.put(writer, &update_id, update_result)
} }
pub fn update_result( pub fn update_result(
&self, self,
reader: &zlmdb::RoTxn, reader: &zlmdb::RoTxn,
update_id: u64, update_id: u64,
) -> ZResult<Option<UpdateResult>> ) -> ZResult<Option<UpdateResult>> {
{
let update_id = BEU64::new(update_id); let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id) self.updates_results.get(reader, &update_id)
} }
View File
@ -1,13 +1,12 @@
use zlmdb::Result as ZResult;
use crate::update::{Update, next_update_id};
use crate::store; use crate::store;
use crate::update::{next_update_id, Update};
use zlmdb::Result as ZResult;
pub fn apply_customs_update( pub fn apply_customs_update(
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
main_store: store::Main, main_store: store::Main,
customs: &[u8], customs: &[u8],
) -> ZResult<()> ) -> ZResult<()> {
{
main_store.put_customs(writer, customs) main_store.put_customs(writer, customs)
} }
@ -16,8 +15,7 @@ pub fn push_customs_update(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
customs: Vec<u8>, customs: Vec<u8>,
) -> ZResult<u64> ) -> ZResult<u64> {
{
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Customs(customs); let update = Update::Customs(customs);
View File
@ -1,14 +1,14 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use fst::{SetBuilder, set::OpBuilder}; use fst::{set::OpBuilder, SetBuilder};
use sdset::{SetOperation, duo::Union}; use sdset::{duo::Union, SetOperation};
use serde::Serialize; use serde::Serialize;
use crate::raw_indexer::RawIndexer; use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, Serializer, RamDocumentStore}; use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
use crate::store; use crate::store;
use crate::update::{Update, next_update_id, apply_documents_deletion}; use crate::update::{apply_documents_deletion, next_update_id, Update};
use crate::{MResult, Error, RankedMap}; use crate::{Error, MResult, RankedMap};
pub struct DocumentsAddition<D> { pub struct DocumentsAddition<D> {
updates_store: store::Updates, updates_store: store::Updates,
@ -22,8 +22,7 @@ impl<D> DocumentsAddition<D> {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: crossbeam_channel::Sender<()>,
) -> DocumentsAddition<D> ) -> DocumentsAddition<D> {
{
DocumentsAddition { DocumentsAddition {
updates_store, updates_store,
updates_results_store, updates_results_store,
@ -37,7 +36,8 @@ impl<D> DocumentsAddition<D> {
} }
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64>
where D: serde::Serialize where
D: serde::Serialize,
{ {
let _ = self.updates_notifier.send(()); let _ = self.updates_notifier.send(());
let update_id = push_documents_addition( let update_id = push_documents_addition(
@ -51,7 +51,7 @@ impl<D> DocumentsAddition<D> {
} }
impl<D> Extend<D> for DocumentsAddition<D> { impl<D> Extend<D> for DocumentsAddition<D> {
fn extend<T: IntoIterator<Item=D>>(&mut self, iter: T) { fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
self.documents.extend(iter) self.documents.extend(iter)
} }
} }
@ -61,8 +61,7 @@ pub fn push_documents_addition<D: serde::Serialize>(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
addition: Vec<D>, addition: Vec<D>,
) -> MResult<u64> ) -> MResult<u64> {
{
let mut values = Vec::with_capacity(addition.len()); let mut values = Vec::with_capacity(addition.len());
for add in addition { for add in addition {
let vec = serde_json::to_vec(&add)?; let vec = serde_json::to_vec(&add)?;
@ -87,8 +86,7 @@ pub fn apply_documents_addition(
docs_words_store: store::DocsWords, docs_words_store: store::DocsWords,
mut ranked_map: RankedMap, mut ranked_map: RankedMap,
addition: Vec<serde_json::Value>, addition: Vec<serde_json::Value>,
) -> MResult<()> ) -> MResult<()> {
{
let mut document_ids = HashSet::new(); let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new(); let mut document_store = RamDocumentStore::new();
let mut document_fields_counts = HashMap::new(); let mut document_fields_counts = HashMap::new();
@ -182,7 +180,7 @@ pub fn apply_documents_addition(
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)
.unwrap() .unwrap()
}, }
None => delta_words, None => delta_words,
}; };
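Beyond the import reshuffling, the `finalize` and `Extend` hunks show two rustfmt defaults applied throughout this commit: `where` moves to its own line with one bound per line, and associated-type bindings get spaces around `=`. Both in one self-contained sketch:

use std::fmt::Display;

fn join<I, T>(items: I) -> String
where
    I: IntoIterator<Item = T>,
    T: Display,
{
    items
        .into_iter()
        .map(|item| item.to_string())
        .collect::<Vec<_>>()
        .join(", ")
}

fn main() {
    assert_eq!(join(vec![1, 2, 3]), "1, 2, 3");
}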
View File
@ -1,13 +1,13 @@
use std::collections::{HashMap, HashSet, BTreeSet}; use std::collections::{BTreeSet, HashMap, HashSet};
use fst::{SetBuilder, Streamer}; use fst::{SetBuilder, Streamer};
use meilidb_schema::Schema; use meilidb_schema::Schema;
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey}; use sdset::{duo::DifferenceByKey, SetBuf, SetOperation};
use crate::{DocumentId, RankedMap, MResult, Error};
use crate::serde::extract_document_id; use crate::serde::extract_document_id;
use crate::update::{Update, next_update_id};
use crate::store; use crate::store;
use crate::update::{next_update_id, Update};
use crate::{DocumentId, Error, MResult, RankedMap};
pub struct DocumentsDeletion { pub struct DocumentsDeletion {
updates_store: store::Updates, updates_store: store::Updates,
@ -21,8 +21,7 @@ impl DocumentsDeletion {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: crossbeam_channel::Sender<()>,
) -> DocumentsDeletion ) -> DocumentsDeletion {
{
DocumentsDeletion { DocumentsDeletion {
updates_store, updates_store,
updates_results_store, updates_results_store,
@ -36,7 +35,8 @@ impl DocumentsDeletion {
} }
pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()> pub fn delete_document<D>(&mut self, schema: &Schema, document: D) -> MResult<()>
where D: serde::Serialize, where
D: serde::Serialize,
{ {
let identifier = schema.identifier_name(); let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? { let document_id = match extract_document_id(identifier, &document)? {
@ -62,7 +62,7 @@ impl DocumentsDeletion {
} }
impl Extend<DocumentId> for DocumentsDeletion { impl Extend<DocumentId> for DocumentsDeletion {
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) { fn extend<T: IntoIterator<Item = DocumentId>>(&mut self, iter: T) {
self.documents.extend(iter) self.documents.extend(iter)
} }
} }
@ -72,8 +72,7 @@ pub fn push_documents_deletion(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
deletion: Vec<DocumentId>, deletion: Vec<DocumentId>,
) -> MResult<u64> ) -> MResult<u64> {
{
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::DocumentsDeletion(deletion); let update = Update::DocumentsDeletion(deletion);
@ -91,8 +90,7 @@ pub fn apply_documents_deletion(
docs_words_store: store::DocsWords, docs_words_store: store::DocsWords,
mut ranked_map: RankedMap, mut ranked_map: RankedMap,
deletion: Vec<DocumentId>, deletion: Vec<DocumentId>,
) -> MResult<()> ) -> MResult<()> {
{
let idset = SetBuf::from_dirty(deletion); let idset = SetBuf::from_dirty(deletion);
let schema = match main_store.schema(writer)? { let schema = match main_store.schema(writer)? {
@ -101,10 +99,17 @@ pub fn apply_documents_deletion(
}; };
// collect the ranked attributes according to the schema // collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema.iter() let ranked_attrs: Vec<_> = schema
.filter_map(|(_, attr, prop)| { .iter()
if prop.is_ranked() { Some(attr) } else { None } .filter_map(
}) |(_, attr, prop)| {
if prop.is_ranked() {
Some(attr)
} else {
None
}
},
)
.collect(); .collect();
let mut words_document_ids = HashMap::new(); let mut words_document_ids = HashMap::new();
@ -118,7 +123,10 @@ pub fn apply_documents_deletion(
let mut stream = words.stream(); let mut stream = words.stream();
while let Some(word) = stream.next() { while let Some(word) = stream.next() {
let word = word.to_vec(); let word = word.to_vec();
words_document_ids.entry(word).or_insert_with(Vec::new).push(id); words_document_ids
.entry(word)
.or_insert_with(Vec::new)
.push(id);
} }
} }
} }
@ -167,7 +175,7 @@ pub fn apply_documents_deletion(
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)
.unwrap() .unwrap()
}, }
None => fst::Set::default(), None => fst::Set::default(),
}; };
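The `words_document_ids` hunk above is only re-wrapped, but the pattern it wraps is worth isolating: `entry(..).or_insert_with(Vec::new)` allocates the per-word vector only the first time a word is seen. A minimal sketch:

use std::collections::HashMap;

fn main() {
    // word -> ids of the documents containing it, built incrementally.
    let mut words_document_ids: HashMap<String, Vec<u64>> = HashMap::new();

    let postings = [("kitty", 0u64), ("cat", 1), ("kitty", 2)];
    for &(word, id) in postings.iter() {
        words_document_ids
            .entry(word.to_string())
            // Runs the closure only when the key is absent.
            .or_insert_with(Vec::new)
            .push(id);
    }

    assert_eq!(words_document_ids["kitty"], vec![0, 2]);
    assert_eq!(words_document_ids["cat"], vec![1]);
}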
View File
@ -6,21 +6,21 @@ mod synonyms_addition;
mod synonyms_deletion; mod synonyms_deletion;
pub use self::customs_update::{apply_customs_update, push_customs_update}; pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition}; pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update}; pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition}; pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion}; pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::time::{Duration, Instant};
use std::collections::BTreeMap;
use std::cmp; use std::cmp;
use std::collections::BTreeMap;
use std::time::{Duration, Instant};
use log::debug; use log::debug;
use serde::{Serialize, Deserialize}; use serde::{Deserialize, Serialize};
use zlmdb::Result as ZResult; use zlmdb::Result as ZResult;
use crate::{store, MResult, DocumentId, RankedMap}; use crate::{store, DocumentId, MResult, RankedMap};
use meilidb_schema::Schema; use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -68,8 +68,7 @@ pub fn update_status(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
update_id: u64, update_id: u64,
) -> MResult<UpdateStatus> ) -> MResult<UpdateStatus> {
{
match updates_results_store.update_result(reader, update_id)? { match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)), Some(result) => Ok(UpdateStatus::Processed(result)),
None => { None => {
@ -86,8 +85,7 @@ pub fn next_update_id(
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
) -> ZResult<u64> ) -> ZResult<u64> {
{
let last_update_id = updates_store.last_update_id(writer)?; let last_update_id = updates_store.last_update_id(writer)?;
let last_update_id = last_update_id.map(|(n, _)| n); let last_update_id = last_update_id.map(|(n, _)| n);
@ -100,7 +98,10 @@ pub fn next_update_id(
Ok(new_update_id) Ok(new_update_id)
} }
pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> { pub fn update_task(
writer: &mut zlmdb::RwTxn,
index: store::Index,
) -> MResult<Option<UpdateResult>> {
let (update_id, update) = match index.updates.pop_front(writer)? { let (update_id, update) = match index.updates.pop_front(writer)? {
Some(value) => value, Some(value) => value,
None => return Ok(None), None => return Ok(None),
@ -112,11 +113,13 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
Update::Schema(schema) => { Update::Schema(schema) => {
let start = Instant::now(); let start = Instant::now();
let update_type = UpdateType::Schema { schema: schema.clone() }; let update_type = UpdateType::Schema {
schema: schema.clone(),
};
let result = apply_schema_update(writer, index.main, &schema); let result = apply_schema_update(writer, index.main, &schema);
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
}, }
Update::Customs(customs) => { Update::Customs(customs) => {
let start = Instant::now(); let start = Instant::now();
@ -133,7 +136,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
None => RankedMap::default(), None => RankedMap::default(),
}; };
let update_type = UpdateType::DocumentsAddition { number: documents.len() }; let update_type = UpdateType::DocumentsAddition {
number: documents.len(),
};
let result = apply_documents_addition( let result = apply_documents_addition(
writer, writer,
@ -147,7 +152,7 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
); );
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
}, }
Update::DocumentsDeletion(documents) => { Update::DocumentsDeletion(documents) => {
let start = Instant::now(); let start = Instant::now();
@ -156,7 +161,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
None => RankedMap::default(), None => RankedMap::default(),
}; };
let update_type = UpdateType::DocumentsDeletion { number: documents.len() }; let update_type = UpdateType::DocumentsDeletion {
number: documents.len(),
};
let result = apply_documents_deletion( let result = apply_documents_deletion(
writer, writer,
@ -170,38 +177,35 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
); );
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
}, }
Update::SynonymsAddition(synonyms) => { Update::SynonymsAddition(synonyms) => {
let start = Instant::now(); let start = Instant::now();
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() }; let update_type = UpdateType::SynonymsAddition {
number: synonyms.len(),
};
let result = apply_synonyms_addition( let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
writer,
index.main,
index.synonyms,
synonyms,
);
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
}, }
Update::SynonymsDeletion(synonyms) => { Update::SynonymsDeletion(synonyms) => {
let start = Instant::now(); let start = Instant::now();
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() }; let update_type = UpdateType::SynonymsDeletion {
number: synonyms.len(),
};
let result = apply_synonyms_deletion( let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
writer,
index.main,
index.synonyms,
synonyms,
);
(update_type, result, start.elapsed()) (update_type, result, start.elapsed())
}, }
}; };
debug!("Processed update number {} {:?} {:?}", update_id, update_type, result); debug!(
"Processed update number {} {:?} {:?}",
update_id, update_type, result
);
let detailed_duration = DetailedDuration { main: duration }; let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult { let status = UpdateResult {
@ -211,7 +215,9 @@ pub fn update_task(writer: &mut zlmdb::RwTxn, index: store::Index) -> MResult<Op
detailed_duration, detailed_duration,
}; };
index.updates_results.put_update_result(writer, update_id, &status)?; index
.updates_results
.put_update_result(writer, update_id, &status)?;
Ok(Some(status)) Ok(Some(status))
} }
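`next_update_id` is only re-indented here and the diff does not show its whole body; schematically it hands out one past the largest id either store has seen. A sketch of that logic under those assumptions, with `BTreeMap`s standing in for the two LMDB databases:

use std::collections::BTreeMap;

fn next_update_id(
    updates: &BTreeMap<u64, Vec<u8>>,
    results: &BTreeMap<u64, Vec<u8>>,
) -> u64 {
    let last_update = updates.keys().next_back().copied();
    let last_result = results.keys().next_back().copied();
    // One past the largest id seen anywhere; 0 when both stores are empty.
    match last_update.max(last_result) {
        Some(id) => id + 1,
        None => 0,
    }
}

fn main() {
    let mut updates = BTreeMap::new();
    let results = BTreeMap::new();
    assert_eq!(next_update_id(&updates, &results), 0);
    updates.insert(0, b"first update".to_vec());
    assert_eq!(next_update_id(&updates, &results), 1);
}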
View File
@ -1,18 +1,19 @@
use crate::update::{next_update_id, Update};
use crate::{error::UnsupportedOperation, store, MResult};
use meilidb_schema::Schema; use meilidb_schema::Schema;
use crate::{store, error::UnsupportedOperation, MResult};
use crate::update::{Update, next_update_id};
pub fn apply_schema_update( pub fn apply_schema_update(
writer: &mut zlmdb::RwTxn, writer: &mut zlmdb::RwTxn,
main_store: store::Main, main_store: store::Main,
new_schema: &Schema, new_schema: &Schema,
) -> MResult<()> ) -> MResult<()> {
{ if main_store.schema(writer)?.is_some() {
if let Some(_) = main_store.schema(writer)? { return Err(UnsupportedOperation::SchemaAlreadyExists.into());
return Err(UnsupportedOperation::SchemaAlreadyExists.into())
} }
main_store.put_schema(writer, new_schema).map_err(Into::into) main_store
.put_schema(writer, new_schema)
.map_err(Into::into)
} }
pub fn push_schema_update( pub fn push_schema_update(
@ -20,8 +21,7 @@ pub fn push_schema_update(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
schema: Schema, schema: Schema,
) -> MResult<u64> ) -> MResult<u64> {
{
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Schema(schema); let update = Update::Schema(schema);
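The `if let Some(_) = main_store.schema(writer)?` rewrite above is clippy's `redundant_pattern_matching` lint: when the binding is discarded, `is_some()` says the same thing without a pattern. In isolation:

fn stored_schema(slot: &Option<String>) -> Option<&String> {
    slot.as_ref()
}

fn main() {
    let slot = Some(String::from("schema"));

    // Before: clippy warns because the matched value is thrown away.
    if let Some(_) = stored_schema(&slot) {
        println!("schema already exists");
    }

    // After: the same check, no discarded binding.
    if stored_schema(&slot).is_some() {
        println!("schema already exists");
    }
}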
View File
@ -1,10 +1,10 @@
use std::collections::BTreeMap; use std::collections::BTreeMap;
use fst::{SetBuilder, set::OpBuilder}; use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf; use sdset::SetBuf;
use crate::automaton::normalize_str; use crate::automaton::normalize_str;
use crate::update::{Update, next_update_id}; use crate::update::{next_update_id, Update};
use crate::{store, MResult}; use crate::{store, MResult};
pub struct SynonymsAddition { pub struct SynonymsAddition {
@ -19,8 +19,7 @@ impl SynonymsAddition {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: crossbeam_channel::Sender<()>,
) -> SynonymsAddition ) -> SynonymsAddition {
{
SynonymsAddition { SynonymsAddition {
updates_store, updates_store,
updates_results_store, updates_results_store,
@ -30,13 +29,17 @@ impl SynonymsAddition {
} }
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I) pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>, where
T: AsRef<str>, S: AsRef<str>,
I: IntoIterator<Item=T>, T: AsRef<str>,
I: IntoIterator<Item = T>,
{ {
let synonym = normalize_str(synonym.as_ref()); let synonym = normalize_str(synonym.as_ref());
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase()); let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); self.synonyms
.entry(synonym)
.or_insert_with(Vec::new)
.extend(alternatives);
} }
pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> { pub fn finalize(self, writer: &mut zlmdb::RwTxn) -> MResult<u64> {
@ -56,8 +59,7 @@ pub fn push_synonyms_addition(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
addition: BTreeMap<String, Vec<String>>, addition: BTreeMap<String, Vec<String>>,
) -> MResult<u64> ) -> MResult<u64> {
{
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsAddition(addition); let update = Update::SynonymsAddition(addition);
@ -71,8 +73,7 @@ pub fn apply_synonyms_addition(
main_store: store::Main, main_store: store::Main,
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
addition: BTreeMap<String, Vec<String>>, addition: BTreeMap<String, Vec<String>>,
) -> MResult<()> ) -> MResult<()> {
{
let mut synonyms_builder = SetBuilder::memory(); let mut synonyms_builder = SetBuilder::memory();
for (word, alternatives) in addition { for (word, alternatives) in addition {
@ -107,7 +108,7 @@ pub fn apply_synonyms_addition(
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)
.unwrap() .unwrap()
}, }
None => delta_synonyms, None => delta_synonyms,
}; };
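`add_synonym` normalizes the key and lowercases every alternative before merging into the map; the reformatted method chain above does exactly that. A sketch with a trivial stand-in for `normalize_str` (the real one in `crate::automaton` does more than lowercasing):

use std::collections::BTreeMap;

// Hypothetical stand-in for `automaton::normalize_str`.
fn normalize_str(s: &str) -> String {
    s.to_lowercase()
}

fn add_synonym<S, T, I>(
    synonyms: &mut BTreeMap<String, Vec<String>>,
    synonym: S,
    alternatives: I,
) where
    S: AsRef<str>,
    T: AsRef<str>,
    I: IntoIterator<Item = T>,
{
    let synonym = normalize_str(synonym.as_ref());
    let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
    synonyms
        .entry(synonym)
        .or_insert_with(Vec::new)
        .extend(alternatives);
}

fn main() {
    let mut synonyms = BTreeMap::new();
    add_synonym(&mut synonyms, "NYC", vec!["New York", "Big Apple"]);
    add_synonym(&mut synonyms, "nyc", vec!["New York City"]);
    assert_eq!(
        synonyms["nyc"],
        vec!["new york", "big apple", "new york city"]
    );
}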
View File
@ -1,11 +1,11 @@
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::iter::FromIterator; use std::iter::FromIterator;
use fst::{SetBuilder, set::OpBuilder}; use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf; use sdset::SetBuf;
use crate::automaton::normalize_str; use crate::automaton::normalize_str;
use crate::update::{Update, next_update_id}; use crate::update::{next_update_id, Update};
use crate::{store, MResult}; use crate::{store, MResult};
pub struct SynonymsDeletion { pub struct SynonymsDeletion {
@ -20,8 +20,7 @@ impl SynonymsDeletion {
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>, updates_notifier: crossbeam_channel::Sender<()>,
) -> SynonymsDeletion ) -> SynonymsDeletion {
{
SynonymsDeletion { SynonymsDeletion {
updates_store, updates_store,
updates_results_store, updates_results_store,
@ -36,9 +35,10 @@ impl SynonymsDeletion {
} }
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I) pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>, where
T: AsRef<str>, S: AsRef<str>,
I: Iterator<Item=T>, T: AsRef<str>,
I: Iterator<Item = T>,
{ {
let synonym = normalize_str(synonym.as_ref()); let synonym = normalize_str(synonym.as_ref());
let value = self.synonyms.entry(synonym).or_insert(None); let value = self.synonyms.entry(synonym).or_insert(None);
@ -66,8 +66,7 @@ pub fn push_synonyms_deletion(
updates_store: store::Updates, updates_store: store::Updates,
updates_results_store: store::UpdatesResults, updates_results_store: store::UpdatesResults,
deletion: BTreeMap<String, Option<Vec<String>>>, deletion: BTreeMap<String, Option<Vec<String>>>,
) -> MResult<u64> ) -> MResult<u64> {
{
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsDeletion(deletion); let update = Update::SynonymsDeletion(deletion);
@ -81,8 +80,7 @@ pub fn apply_synonyms_deletion(
main_store: store::Main, main_store: store::Main,
synonyms_store: store::Synonyms, synonyms_store: store::Synonyms,
deletion: BTreeMap<String, Option<Vec<String>>>, deletion: BTreeMap<String, Option<Vec<String>>>,
) -> MResult<()> ) -> MResult<()> {
{
let mut delete_whole_synonym_builder = SetBuilder::memory(); let mut delete_whole_synonym_builder = SetBuilder::memory();
for (synonym, alternatives) in deletion { for (synonym, alternatives) in deletion {
@ -98,9 +96,7 @@ pub fn apply_synonyms_deletion(
let alternatives = SetBuf::from_dirty(alternatives); let alternatives = SetBuf::from_dirty(alternatives);
let mut builder = SetBuilder::memory(); let mut builder = SetBuilder::memory();
builder.extend_iter(alternatives).unwrap(); builder.extend_iter(alternatives).unwrap();
builder.into_inner() builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
.and_then(fst::Set::from_bytes)
.unwrap()
}; };
let op = OpBuilder::new() let op = OpBuilder::new()
@ -124,7 +120,7 @@ pub fn apply_synonyms_deletion(
} else { } else {
synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?; synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
} }
}, }
None => { None => {
delete_whole_synonym_builder.insert(&synonym).unwrap(); delete_whole_synonym_builder.insert(&synonym).unwrap();
synonyms_store.del_synonyms(writer, synonym.as_bytes())?; synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
@ -150,7 +146,7 @@ pub fn apply_synonyms_deletion(
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)
.unwrap() .unwrap()
}, }
None => fst::Set::default(), None => fst::Set::default(),
}; };
View File
@ -1,14 +1,26 @@
use std::collections::{HashMap, BTreeMap}; use std::collections::{BTreeMap, HashMap};
use std::{fmt, u16};
use std::ops::BitOr; use std::ops::BitOr;
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, u16};
use serde::{Serialize, Deserialize};
use indexmap::IndexMap; use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false }; pub const DISPLAYED: SchemaProps = SchemaProps {
pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false }; displayed: true,
pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true }; indexed: false,
ranked: false,
};
pub const INDEXED: SchemaProps = SchemaProps {
displayed: false,
indexed: true,
ranked: false,
};
pub const RANKED: SchemaProps = SchemaProps {
displayed: false,
indexed: false,
ranked: true,
};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps { pub struct SchemaProps {
@ -80,7 +92,13 @@ impl SchemaBuilder {
} }
let identifier = self.identifier; let identifier = self.identifier;
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) } Schema {
inner: Arc::new(InnerSchema {
identifier,
attrs,
props,
}),
}
} }
} }
@ -100,7 +118,10 @@ impl Schema {
fn to_builder(&self) -> SchemaBuilder { fn to_builder(&self) -> SchemaBuilder {
let identifier = self.inner.identifier.clone(); let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered(); let attributes = self.attributes_ordered();
SchemaBuilder { identifier, attributes } SchemaBuilder {
identifier,
attributes,
}
} }
fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> { fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
@ -136,18 +157,18 @@ impl Schema {
name name
} }
pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a { pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter() self.inner.props.iter().map(move |(name, prop)| {
.map(move |(name, prop)| { let attr = self.inner.attrs.get(name).unwrap();
let attr = self.inner.attrs.get(name).unwrap(); (name.as_str(), *attr, *prop)
(name.as_str(), *attr, *prop) })
})
} }
} }
impl Serialize for Schema { impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: serde::ser::Serializer, where
S: serde::ser::Serializer,
{ {
self.to_builder().serialize(serializer) self.to_builder().serialize(serializer)
} }
@ -155,15 +176,15 @@ impl Serialize for Schema {
impl<'de> Deserialize<'de> for Schema { impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: serde::de::Deserializer<'de>, where
D: serde::de::Deserializer<'de>,
{ {
let builder = SchemaBuilder::deserialize(deserializer)?; let builder = SchemaBuilder::deserialize(deserializer)?;
Ok(builder.build()) Ok(builder.build())
} }
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16); pub struct SchemaAttr(pub u16);
impl SchemaAttr { impl SchemaAttr {
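The `DISPLAYED`/`INDEXED`/`RANKED` constants are meant to be combined, which is why `std::ops::BitOr` is imported above. A reconstruction of that pattern; the `bitor` body is an assumption from the field names, not copied from the crate:

use std::ops::BitOr;

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct SchemaProps {
    displayed: bool,
    indexed: bool,
    ranked: bool,
}

pub const DISPLAYED: SchemaProps = SchemaProps {
    displayed: true,
    indexed: false,
    ranked: false,
};

pub const RANKED: SchemaProps = SchemaProps {
    displayed: false,
    indexed: false,
    ranked: true,
};

impl BitOr for SchemaProps {
    type Output = SchemaProps;

    // Assumed field-wise `or`: combining property sets keeps every flag.
    fn bitor(self, other: SchemaProps) -> SchemaProps {
        SchemaProps {
            displayed: self.displayed | other.displayed,
            indexed: self.indexed | other.indexed,
            ranked: self.ranked | other.ranked,
        }
    }
}

fn main() {
    let props = DISPLAYED | RANKED;
    assert!(props.displayed && props.ranked && !props.indexed);
}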
View File
@ -1,17 +1,17 @@
use std::iter::Peekable;
use slice_group_by::StrGroupBy;
use self::SeparatorCategory::*; use self::SeparatorCategory::*;
use slice_group_by::StrGroupBy;
use std::iter::Peekable;
pub fn is_cjk(c: char) -> bool { pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') || (c >= '\u{2e80}' && c <= '\u{2eff}')
(c >= '\u{2f00}' && c <= '\u{2fdf}') || || (c >= '\u{2f00}' && c <= '\u{2fdf}')
(c >= '\u{3040}' && c <= '\u{309f}') || || (c >= '\u{3040}' && c <= '\u{309f}')
(c >= '\u{30a0}' && c <= '\u{30ff}') || || (c >= '\u{30a0}' && c <= '\u{30ff}')
(c >= '\u{3100}' && c <= '\u{312f}') || || (c >= '\u{3100}' && c <= '\u{312f}')
(c >= '\u{3200}' && c <= '\u{32ff}') || || (c >= '\u{3200}' && c <= '\u{32ff}')
(c >= '\u{3400}' && c <= '\u{4dbf}') || || (c >= '\u{3400}' && c <= '\u{4dbf}')
(c >= '\u{4e00}' && c <= '\u{9fff}') || || (c >= '\u{4e00}' && c <= '\u{9fff}')
(c >= '\u{f900}' && c <= '\u{faff}') || (c >= '\u{f900}' && c <= '\u{faff}')
} }
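rustfmt only moves the `||` operators to the start of each line here; an equivalent formulation that later clippy lints push toward uses `RangeInclusive::contains`, which reads closer to a Unicode block table. The block list below is copied from the diff:

pub fn is_cjk(c: char) -> bool {
    let blocks = [
        '\u{2e80}'..='\u{2eff}',
        '\u{2f00}'..='\u{2fdf}',
        '\u{3040}'..='\u{309f}',
        '\u{30a0}'..='\u{30ff}',
        '\u{3100}'..='\u{312f}',
        '\u{3200}'..='\u{32ff}',
        '\u{3400}'..='\u{4dbf}',
        '\u{4e00}'..='\u{9fff}',
        '\u{f900}'..='\u{faff}',
    ];
    blocks.iter().any(|block| block.contains(&c))
}

fn main() {
    assert!(is_cjk('\u{6f22}')); // 漢
    assert!(!is_cjk('a'));
}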
#[derive(Debug, Copy, Clone, PartialEq, Eq)] #[derive(Debug, Copy, Clone, PartialEq, Eq)]
@ -22,7 +22,11 @@ enum SeparatorCategory {
impl SeparatorCategory { impl SeparatorCategory {
fn merge(self, other: SeparatorCategory) -> SeparatorCategory { fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
if let (Soft, Soft) = (self, other) { Soft } else { Hard } if let (Soft, Soft) = (self, other) {
Soft
} else {
Hard
}
} }
fn to_usize(self) -> usize { fn to_usize(self) -> usize {
@ -40,7 +44,7 @@ fn is_separator(c: char) -> bool {
fn classify_separator(c: char) -> Option<SeparatorCategory> { fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c { match c {
' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft), ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard), '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
_ => None, _ => None,
} }
} }
@ -79,7 +83,7 @@ fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, u
(n + 1, i + c.len_utf8()) (n + 1, i + c.len_utf8())
} }
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> { pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
Tokenizer::new(query).map(|t| t.word) Tokenizer::new(query).map(|t| t.word)
} }
@ -100,9 +104,10 @@ impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer { pub fn new(string: &str) -> Tokenizer {
// skip every separator and set `char_index` // skip every separator and set `char_index`
// to the number of char trimmed // to the number of char trimmed
let (count, index) = string.char_indices() let (count, index) = string
.take_while(|(_, c)| is_separator(*c)) .char_indices()
.fold((0, 0), chars_count_index); .take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
Tokenizer { Tokenizer {
inner: &string[index..], inner: &string[index..],
@ -122,10 +127,11 @@ impl<'a> Iterator for Tokenizer<'a> {
let (count, index) = string.char_indices().fold((0, 0), chars_count_index); let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
if !is_str_word(string) { if !is_str_word(string) {
self.word_index += string.chars() self.word_index += string
.filter_map(classify_separator) .chars()
.fold(Soft, |a, x| a.merge(x)) .filter_map(classify_separator)
.to_usize(); .fold(Soft, |a, x| a.merge(x))
.to_usize();
self.char_index += count; self.char_index += count;
self.inner = &self.inner[index..]; self.inner = &self.inner[index..];
continue; continue;
@ -153,7 +159,8 @@ impl<'a> Iterator for Tokenizer<'a> {
} }
pub struct SeqTokenizer<'a, I> pub struct SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>, where
I: Iterator<Item = &'a str>,
{ {
inner: I, inner: I,
current: Option<Peekable<Tokenizer<'a>>>, current: Option<Peekable<Tokenizer<'a>>>,
@ -162,13 +169,14 @@ where I: Iterator<Item=&'a str>,
} }
impl<'a, I> SeqTokenizer<'a, I> impl<'a, I> SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>, where
I: Iterator<Item = &'a str>,
{ {
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> { pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
let current = iter.next().map(|s| Tokenizer::new(s).peekable()); let current = iter.next().map(|s| Tokenizer::new(s).peekable());
SeqTokenizer { SeqTokenizer {
inner: iter, inner: iter,
current: current, current,
word_offset: 0, word_offset: 0,
char_offset: 0, char_offset: 0,
} }
@ -176,7 +184,8 @@ where I: Iterator<Item=&'a str>,
} }
impl<'a, I> Iterator for SeqTokenizer<'a, I> impl<'a, I> Iterator for SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>, where
I: Iterator<Item = &'a str>,
{ {
type Item = Token<'a>; type Item = Token<'a>;
@ -202,15 +211,15 @@ where I: Iterator<Item=&'a str>,
} }
Some(token) Some(token)
}, }
None => { None => {
// no more words in this text we must // no more words in this text we must
// start tokenizing the next text // start tokenizing the next text
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable()); self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
self.next() self.next()
}, }
} }
}, }
// no more texts available // no more texts available
None => None, None => None,
} }
@ -225,12 +234,26 @@ mod tests {
fn easy() { fn easy() {
let mut tokenizer = Tokenizer::new("salut"); let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 })); assert_eq!(
tokenizer.next(),
Some(Token {
word: "salut",
word_index: 0,
char_index: 0
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo "); let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); assert_eq!(
tokenizer.next(),
Some(Token {
word: "yo",
word_index: 0,
char_index: 0
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
@ -238,19 +261,82 @@ mod tests {
fn hard() { fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)"); let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); assert_eq!(
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 })); tokenizer.next(),
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); Some(Token {
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 })); word: "yo",
word_index: 0,
char_index: 4
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolo",
word_index: 1,
char_index: 7
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "aïe",
word_index: 9,
char_index: 13
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "ouch",
word_index: 17,
char_index: 18
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); assert_eq!(
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); tokenizer.next(),
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 })); Some(Token {
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 })); word: "yo",
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 })); word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolo",
word_index: 8,
char_index: 5
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "wtf",
word_index: 16,
char_index: 12
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lol",
word_index: 17,
char_index: 18
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "aïe",
word_index: 25,
char_index: 24
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
@ -258,18 +344,74 @@ mod tests {
fn hard_long_chars() { fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe"); let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); assert_eq!(
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); tokenizer.next(),
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 })); Some(Token {
word: "yo",
word_index: 0,
char_index: 4
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "😂",
word_index: 1,
char_index: 7
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "aïe",
word_index: 9,
char_index: 10
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,"); let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); assert_eq!(
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); tokenizer.next(),
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); Some(Token {
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 })); word: "yo",
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 })); word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolo",
word_index: 8,
char_index: 5
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "😱",
word_index: 16,
char_index: 12
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lol",
word_index: 17,
char_index: 16
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "😣",
word_index: 25,
char_index: 22
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
@ -277,19 +419,82 @@ mod tests {
fn hard_kanjis() { fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}"); let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); assert_eq!(
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 })); tokenizer.next(),
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 })); Some(Token {
word: "\u{2ec4}",
word_index: 0,
char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolilol",
word_index: 1,
char_index: 1
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ec7}",
word_index: 2,
char_index: 8
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}"); let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); assert_eq!(
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 })); tokenizer.next(),
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 })); Some(Token {
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 })); word: "\u{2ec4}",
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 })); word_index: 0,
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 })); char_index: 0
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ed3}",
word_index: 1,
char_index: 1
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ef2}",
word_index: 2,
char_index: 2
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "lolilol",
word_index: 3,
char_index: 4
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "hello",
word_index: 4,
char_index: 14
})
);
assert_eq!(
tokenizer.next(),
Some(Token {
word: "\u{2ec7}",
word_index: 5,
char_index: 23
})
);
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
} }