format the whole project

This commit is contained in:
Tamo 2021-06-16 18:33:33 +02:00
parent ba30cef987
commit 9716fb3b36
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
68 changed files with 3327 additions and 2336 deletions

5
.rustfmt.toml Normal file
View File

@ -0,0 +1,5 @@
unstable_features = true
use_small_heuristics = "max"
imports_granularity = "Module"
group_imports = "StdExternalCrate"

View File

@ -41,3 +41,18 @@ the `content-type:application/json` and `content-type:application/x-ndjson` head
### Querying the engine via the website ### Querying the engine via the website
You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700).
## Contributing
You can set up a `git-hook` to keep you from committing too hastily. It will block the commit if:
- Any of the workspaces does not build
- Your code is not well-formatted
Both checks also run in the CI, so bypassing the hook won't help you get your code merged.
If you need to, you can still pass `--no-verify` when creating your commit to skip the hook.
To enable the hook, run the following command from the root of the project:
```
cp script/pre-commit .git/hooks/pre-commit
```

View File

@ -6,33 +6,24 @@ use milli::update::Settings;
use utils::Conf; use utils::Conf;
fn base_conf(builder: &mut Settings) { fn base_conf(builder: &mut Settings) {
let displayed_fields = [ let displayed_fields =
"id", "title", "album", "artist", "genre", "country", "released", "duration", ["id", "title", "album", "artist", "genre", "country", "released", "duration"]
]
.iter() .iter()
.map(|s| s.to_string()) .map(|s| s.to_string())
.collect(); .collect();
builder.set_displayed_fields(displayed_fields); builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "album", "artist"] let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
.iter()
.map(|s| s.to_string())
.collect();
builder.set_searchable_fields(searchable_fields); builder.set_searchable_fields(searchable_fields);
let faceted_fields = [ let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
"released-timestamp",
"duration-float",
"genre",
"country",
"artist",
]
.iter() .iter()
.map(|s| s.to_string()) .map(|s| s.to_string())
.collect(); .collect();
builder.set_filterable_fields(faceted_fields); builder.set_filterable_fields(faceted_fields);
} }
#[rustfmt::skip]
const BASE_CONF: Conf = Conf { const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_SONGS, dataset: datasets_paths::SMOL_SONGS,
queries: &[ queries: &[
@ -53,34 +44,25 @@ const BASE_CONF: Conf = Conf {
}; };
fn bench_songs(c: &mut criterion::Criterion) { fn bench_songs(c: &mut criterion::Criterion) {
let default_criterion: Vec<String> = milli::default_criteria() let default_criterion: Vec<String> =
.iter() milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect();
.map(|criteria| criteria.to_string())
.collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str()); let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") let asc_default: Vec<&str> =
.chain(default_criterion.clone()) std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect();
.collect(); let desc_default: Vec<&str> =
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect();
.chain(default_criterion.clone())
.collect();
let basic_with_quote: Vec<String> = BASE_CONF let basic_with_quote: Vec<String> = BASE_CONF
.queries .queries
.iter() .iter()
.map(|s| { .map(|s| {
s.trim() s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
}) })
.collect(); .collect();
let basic_with_quote: &[&str] = &basic_with_quote let basic_with_quote: &[&str] =
.iter() &basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
.map(|s| s.as_str())
.collect::<Vec<&str>>();
#[rustfmt::skip]
let confs = &[ let confs = &[
/* first we bench each criterion alone */ /* first we bench each criterion alone */
utils::Conf { utils::Conf {

View File

@ -3,10 +3,8 @@ use std::path::Path;
use criterion::BenchmarkId; use criterion::BenchmarkId;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use milli::{ use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, use milli::{FilterCondition, Index};
FilterCondition, Index,
};
pub struct Conf<'a> { pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory /// where we are going to create our database.mmdb directory

View File

@ -6,16 +6,14 @@ use milli::update::Settings;
use utils::Conf; use utils::Conf;
fn base_conf(builder: &mut Settings) { fn base_conf(builder: &mut Settings) {
let displayed_fields = ["title", "body", "url"] let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields); builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields); builder.set_searchable_fields(searchable_fields);
} }
#[rustfmt::skip]
const BASE_CONF: Conf = Conf { const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_WIKI_ARTICLES, dataset: datasets_paths::SMOL_WIKI_ARTICLES,
queries: &[ queries: &[
@ -37,18 +35,13 @@ fn bench_songs(c: &mut criterion::Criterion) {
.queries .queries
.iter() .iter()
.map(|s| { .map(|s| {
s.trim() s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
}) })
.collect(); .collect();
let basic_with_quote: &[&str] = &basic_with_quote let basic_with_quote: &[&str] =
.iter() &basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
.map(|s| s.as_str())
.collect::<Vec<&str>>();
#[rustfmt::skip]
let confs = &[ let confs = &[
/* first we bench each criterion alone */ /* first we bench each criterion alone */
utils::Conf { utils::Conf {

View File

@ -1,9 +1,7 @@
use std::fs::File;
use std::io::{Cursor, Read, Seek, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::{env, fs}; use std::{env, fs};
use std::{
fs::File,
io::{Cursor, Read, Seek, Write},
};
use bytes::Bytes; use bytes::Bytes;
use convert_case::{Case, Casing}; use convert_case::{Case, Casing};
@ -45,7 +43,10 @@ fn main() -> anyhow::Result<()> {
)?; )?;
if out_file.exists() { if out_file.exists() {
eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); eprintln!(
"The dataset {} already exists on the file system and will not be downloaded again",
dataset
);
continue; continue;
} }
let url = format!("{}/{}.csv.gz", BASE_URL, dataset); let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
@ -60,12 +61,8 @@ fn main() -> anyhow::Result<()> {
} }
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> { fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
let bytes = reqwest::blocking::Client::builder() let bytes =
.timeout(None) reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?;
.build()?
.get(url)
.send()?
.bytes()?;
Ok(Cursor::new(bytes)) Ok(Cursor::new(bytes))
} }

View File

@ -1,9 +1,8 @@
use std::path::PathBuf; use std::path::PathBuf;
use byte_unit::Byte; use byte_unit::Byte;
use heed::{Env, EnvOpenOptions, CompactionOption}; use heed::{CompactionOption, Env, EnvOpenOptions};
use structopt::StructOpt; use structopt::StructOpt;
use Command::*; use Command::*;
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
@ -65,7 +64,7 @@ fn main() -> anyhow::Result<()> {
use CompactionOption::*; use CompactionOption::*;
let compaction = if enable_compaction { Enabled } else { Disabled }; let compaction = if enable_compaction { Enabled } else { Disabled };
copy_main_database_to_stdout(env, compaction) copy_main_database_to_stdout(env, compaction)
}, }
} }
} }

View File

@ -1,6 +1,5 @@
mod update_store; mod update_store;
use std::{io, mem};
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fmt::Display; use std::fmt::Display;
use std::fs::{create_dir_all, File}; use std::fs::{create_dir_all, File};
@ -10,16 +9,19 @@ use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
use std::time::Instant; use std::time::Instant;
use std::{io, mem};
use askama_warp::Template; use askama_warp::Template;
use byte_unit::Byte; use byte_unit::Byte;
use either::Either; use either::Either;
use flate2::read::GzDecoder; use flate2::read::GzDecoder;
use futures::{FutureExt, StreamExt}; use futures::{stream, FutureExt, StreamExt};
use futures::stream;
use grenad::CompressionType; use grenad::CompressionType;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use milli::update::UpdateIndexingStep::*;
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult};
use once_cell::sync::OnceCell; use once_cell::sync::OnceCell;
use rayon::ThreadPool; use rayon::ThreadPool;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -28,12 +30,9 @@ use structopt::StructOpt;
use tokio::fs::File as TFile; use tokio::fs::File as TFile;
use tokio::io::AsyncWriteExt; use tokio::io::AsyncWriteExt;
use tokio::sync::broadcast; use tokio::sync::broadcast;
use warp::{Filter, http::Response};
use warp::filters::ws::Message; use warp::filters::ws::Message;
use warp::http::Response;
use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult}; use warp::Filter;
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::update::UpdateIndexingStep::*;
use self::update_store::UpdateStore; use self::update_store::UpdateStore;
@ -149,25 +148,28 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
for (word, token) in analyzed.reconstruct() { for (word, token) in analyzed.reconstruct() {
if token.is_word() { if token.is_word() {
let to_highlight = matching_words.matching_bytes(token.text()).is_some(); let to_highlight = matching_words.matching_bytes(token.text()).is_some();
if to_highlight { string.push_str("<mark>") } if to_highlight {
string.push_str("<mark>")
}
string.push_str(word); string.push_str(word);
if to_highlight { string.push_str("</mark>") } if to_highlight {
string.push_str("</mark>")
}
} else { } else {
string.push_str(word); string.push_str(word);
} }
} }
Value::String(string) Value::String(string)
} }
Value::Array(values) => { Value::Array(values) => Value::Array(
Value::Array(values.into_iter() values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(),
.map(|v| self.highlight_value(v, matching_words)) ),
.collect()) Value::Object(object) => Value::Object(
} object
Value::Object(object) => { .into_iter()
Value::Object(object.into_iter()
.map(|(k, v)| (k, self.highlight_value(v, matching_words))) .map(|(k, v)| (k, self.highlight_value(v, matching_words)))
.collect()) .collect(),
} ),
} }
} }
@ -236,12 +238,7 @@ enum UpdateMeta {
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")] #[serde(tag = "type")]
enum UpdateMetaProgress { enum UpdateMetaProgress {
DocumentsAddition { DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
step: usize,
total_steps: usize,
current: usize,
total: Option<usize>,
},
} }
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@ -342,11 +339,13 @@ async fn main() -> anyhow::Result<()> {
update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size);
update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type);
update_builder.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); update_builder
.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes());
let before_update = Instant::now(); let before_update = Instant::now();
// we extract the update type and execute the update itself. // we extract the update type and execute the update itself.
let result: anyhow::Result<()> = match meta { let result: anyhow::Result<()> =
match meta {
UpdateMeta::DocumentsAddition { method, format, encoding } => { UpdateMeta::DocumentsAddition { method, format, encoding } => {
// We must use the write transaction of the update here. // We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?; let mut wtxn = index_cloned.write_txn()?;
@ -360,8 +359,10 @@ async fn main() -> anyhow::Result<()> {
}; };
match method.as_str() { match method.as_str() {
"replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments), "replace" => builder
"update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments), .index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
"update" => builder
.index_documents_method(IndexDocumentsMethod::UpdateDocuments),
otherwise => panic!("invalid indexing method {:?}", otherwise), otherwise => panic!("invalid indexing method {:?}", otherwise),
}; };
@ -373,10 +374,18 @@ async fn main() -> anyhow::Result<()> {
let result = builder.execute(reader, |indexing_step, update_id| { let result = builder.execute(reader, |indexing_step, update_id| {
let (current, total) = match indexing_step { let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), TransformFromUserIntoGenericFormat { documents_seen } => {
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), (documents_seen, None)
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), }
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
IndexDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
(databases_seen, Some(total_databases))
}
}; };
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
update_id, update_id,
@ -411,52 +420,66 @@ async fn main() -> anyhow::Result<()> {
// We transpose the settings JSON struct into a real setting update. // We transpose the settings JSON struct into a real setting update.
match settings.searchable_attributes { match settings.searchable_attributes {
Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes), Setting::Set(searchable_attributes) => {
builder.set_searchable_fields(searchable_attributes)
}
Setting::Reset => builder.reset_searchable_fields(), Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => () Setting::NotSet => (),
} }
// We transpose the settings JSON struct into a real setting update. // We transpose the settings JSON struct into a real setting update.
match settings.displayed_attributes { match settings.displayed_attributes {
Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes), Setting::Set(displayed_attributes) => {
builder.set_displayed_fields(displayed_attributes)
}
Setting::Reset => builder.reset_displayed_fields(), Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => () Setting::NotSet => (),
} }
// We transpose the settings JSON struct into a real setting update. // We transpose the settings JSON struct into a real setting update.
match settings.filterable_attributes { match settings.filterable_attributes {
Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes), Setting::Set(filterable_attributes) => {
builder.set_filterable_fields(filterable_attributes)
}
Setting::Reset => builder.reset_filterable_fields(), Setting::Reset => builder.reset_filterable_fields(),
Setting::NotSet => () Setting::NotSet => (),
} }
// We transpose the settings JSON struct into a real setting update. // We transpose the settings JSON struct into a real setting update.
match settings.criteria { match settings.criteria {
Setting::Set(criteria) => builder.set_criteria(criteria), Setting::Set(criteria) => builder.set_criteria(criteria),
Setting::Reset => builder.reset_criteria(), Setting::Reset => builder.reset_criteria(),
Setting::NotSet => () Setting::NotSet => (),
} }
// We transpose the settings JSON struct into a real setting update. // We transpose the settings JSON struct into a real setting update.
match settings.stop_words { match settings.stop_words {
Setting::Set(stop_words) => builder.set_stop_words(stop_words), Setting::Set(stop_words) => builder.set_stop_words(stop_words),
Setting::Reset => builder.reset_stop_words(), Setting::Reset => builder.reset_stop_words(),
Setting::NotSet => () Setting::NotSet => (),
} }
// We transpose the settings JSON struct into a real setting update. // We transpose the settings JSON struct into a real setting update.
match settings.synonyms { match settings.synonyms {
Setting::Set(synonyms) => builder.set_synonyms(synonyms), Setting::Set(synonyms) => builder.set_synonyms(synonyms),
Setting::Reset => builder.reset_synonyms(), Setting::Reset => builder.reset_synonyms(),
Setting::NotSet => () Setting::NotSet => (),
} }
let result = builder.execute(|indexing_step, update_id| { let result = builder.execute(|indexing_step, update_id| {
let (current, total) = match indexing_step { let (current, total) = match indexing_step {
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), TransformFromUserIntoGenericFormat { documents_seen } => {
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), (documents_seen, None)
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), }
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
IndexDocuments { documents_seen, total_documents } => {
(documents_seen, Some(total_documents))
}
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
(databases_seen, Some(total_databases))
}
}; };
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
update_id, update_id,
@ -492,7 +515,9 @@ async fn main() -> anyhow::Result<()> {
}; };
let meta = match result { let meta = match result {
Ok(()) => format!("valid update content processed in {:.02?}", before_update.elapsed()), Ok(()) => {
format!("valid update content processed in {:.02?}", before_update.elapsed())
}
Err(e) => format!("error while processing update content: {:?}", e), Err(e) => format!("error while processing update content: {:?}", e),
}; };
@ -500,7 +525,8 @@ async fn main() -> anyhow::Result<()> {
let _ = update_status_sender_cloned.send(processed); let _ = update_status_sender_cloned.send(processed);
Ok(meta) Ok(meta)
})?; },
)?;
// The database name will not change. // The database name will not change.
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
@ -512,15 +538,11 @@ async fn main() -> anyhow::Result<()> {
let db_name_cloned = db_name.clone(); let db_name_cloned = db_name.clone();
let lmdb_path_cloned = lmdb_path.clone(); let lmdb_path_cloned = lmdb_path.clone();
let index_cloned = index.clone(); let index_cloned = index.clone();
let dash_html_route = warp::filters::method::get() let dash_html_route =
.and(warp::filters::path::end()) warp::filters::method::get().and(warp::filters::path::end()).map(move || {
.map(move || {
// We retrieve the database size. // We retrieve the database size.
let db_size = File::open(lmdb_path_cloned.clone()) let db_size =
.unwrap() File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize;
.metadata()
.unwrap()
.len() as usize;
// And the number of documents in the database. // And the number of documents in the database.
let rtxn = index_cloned.read_txn().unwrap(); let rtxn = index_cloned.read_txn().unwrap();
@ -537,7 +559,8 @@ async fn main() -> anyhow::Result<()> {
.and(warp::path!("updates")) .and(warp::path!("updates"))
.map(move |header: String| { .map(move |header: String| {
let update_store = update_store_cloned.clone(); let update_store = update_store_cloned.clone();
let mut updates = update_store.iter_metas(|processed, aborted, pending| { let mut updates = update_store
.iter_metas(|processed, aborted, pending| {
let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new(); let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new();
for result in processed { for result in processed {
let (uid, meta) = result?; let (uid, meta) = result?;
@ -552,96 +575,89 @@ async fn main() -> anyhow::Result<()> {
updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
} }
Ok(updates) Ok(updates)
}).unwrap(); })
.unwrap();
updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse()); updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse());
if header.contains("text/html") { if header.contains("text/html") {
// We retrieve the database size. // We retrieve the database size.
let db_size = File::open(lmdb_path_cloned.clone()) let db_size =
.unwrap() File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len()
.metadata() as usize;
.unwrap()
.len() as usize;
// And the number of documents in the database. // And the number of documents in the database.
let rtxn = index_cloned.read_txn().unwrap(); let rtxn = index_cloned.read_txn().unwrap();
let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize; let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
let template = UpdatesTemplate { let template =
db_name: db_name.clone(), UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates };
db_size,
docs_count,
updates,
};
Box::new(template) as Box<dyn warp::Reply> Box::new(template) as Box<dyn warp::Reply>
} else { } else {
Box::new(warp::reply::json(&updates)) Box::new(warp::reply::json(&updates))
} }
}); });
let dash_bulma_route = warp::filters::method::get() let dash_bulma_route =
.and(warp::path!("bulma.min.css")) warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "text/css; charset=utf-8") .header("content-type", "text/css; charset=utf-8")
.body(include_str!("../public/bulma.min.css")) .body(include_str!("../public/bulma.min.css"))
); });
let dash_bulma_dark_route = warp::filters::method::get() let dash_bulma_dark_route =
.and(warp::path!("bulma-prefers-dark.min.css")) warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "text/css; charset=utf-8") .header("content-type", "text/css; charset=utf-8")
.body(include_str!("../public/bulma-prefers-dark.min.css")) .body(include_str!("../public/bulma-prefers-dark.min.css"))
); });
let dash_style_route = warp::filters::method::get() let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| {
.and(warp::path!("style.css")) Response::builder()
.map(|| Response::builder()
.header("content-type", "text/css; charset=utf-8") .header("content-type", "text/css; charset=utf-8")
.body(include_str!("../public/style.css")) .body(include_str!("../public/style.css"))
); });
let dash_jquery_route = warp::filters::method::get() let dash_jquery_route =
.and(warp::path!("jquery-3.4.1.min.js")) warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "application/javascript; charset=utf-8") .header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/jquery-3.4.1.min.js")) .body(include_str!("../public/jquery-3.4.1.min.js"))
); });
let dash_filesize_route = warp::filters::method::get() let dash_filesize_route =
.and(warp::path!("filesize.min.js")) warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "application/javascript; charset=utf-8") .header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/filesize.min.js")) .body(include_str!("../public/filesize.min.js"))
); });
let dash_script_route = warp::filters::method::get() let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| {
.and(warp::path!("script.js")) Response::builder()
.map(|| Response::builder()
.header("content-type", "application/javascript; charset=utf-8") .header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/script.js")) .body(include_str!("../public/script.js"))
); });
let updates_script_route = warp::filters::method::get() let updates_script_route =
.and(warp::path!("updates-script.js")) warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "application/javascript; charset=utf-8") .header("content-type", "application/javascript; charset=utf-8")
.body(include_str!("../public/updates-script.js")) .body(include_str!("../public/updates-script.js"))
); });
let dash_logo_white_route = warp::filters::method::get() let dash_logo_white_route =
.and(warp::path!("logo-white.svg")) warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "image/svg+xml") .header("content-type", "image/svg+xml")
.body(include_str!("../public/logo-white.svg")) .body(include_str!("../public/logo-white.svg"))
); });
let dash_logo_black_route = warp::filters::method::get() let dash_logo_black_route =
.and(warp::path!("logo-black.svg")) warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| {
.map(|| Response::builder() Response::builder()
.header("content-type", "image/svg+xml") .header("content-type", "image/svg+xml")
.body(include_str!("../public/logo-black.svg")) .body(include_str!("../public/logo-black.svg"))
); });
#[derive(Debug, Deserialize)] #[derive(Debug, Deserialize)]
#[serde(untagged)] #[serde(untagged)]
@ -719,7 +735,8 @@ async fn main() -> anyhow::Result<()> {
search.filter(condition); search.filter(condition);
} }
let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); let SearchResult { matching_words, candidates, documents_ids } =
search.execute().unwrap();
let number_of_candidates = candidates.len(); let number_of_candidates = candidates.len();
let facets = if query.facet_distribution == Some(true) { let facets = if query.facet_distribution == Some(true) {
@ -745,17 +762,18 @@ async fn main() -> anyhow::Result<()> {
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting { if !disable_highlighting {
highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); highlighter.highlight_record(
&mut object,
&matching_words,
&attributes_to_highlight,
);
} }
documents.push(object); documents.push(object);
} }
let answer = Answer { let answer =
documents, Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() };
number_of_candidates,
facets: facets.unwrap_or_default(),
};
Response::builder() Response::builder()
.header("Content-Type", "application/json") .header("Content-Type", "application/json")
@ -764,9 +782,8 @@ async fn main() -> anyhow::Result<()> {
}); });
let index_cloned = index.clone(); let index_cloned = index.clone();
let document_route = warp::filters::method::get() let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map(
.and(warp::path!("document" / String)) move |id: String| {
.map(move |id: String| {
let index = index_cloned.clone(); let index = index_cloned.clone();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
@ -780,30 +797,31 @@ async fn main() -> anyhow::Result<()> {
match external_documents_ids.get(&id) { match external_documents_ids.get(&id) {
Some(document_id) => { Some(document_id) => {
let document_id = document_id as u32; let document_id = document_id as u32;
let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); let (_, obkv) =
index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
Response::builder() Response::builder()
.header("Content-Type", "application/json") .header("Content-Type", "application/json")
.body(serde_json::to_string(&document).unwrap()) .body(serde_json::to_string(&document).unwrap())
} }
None => { None => Response::builder()
Response::builder()
.status(404) .status(404)
.body(format!("Document with id {:?} not found.", id)) .body(format!("Document with id {:?} not found.", id)),
} }
} },
}); );
async fn buf_stream( async fn buf_stream(
update_store: Arc<UpdateStore<UpdateMeta, String>>, update_store: Arc<UpdateStore<UpdateMeta, String>>,
update_status_sender: broadcast::Sender<UpdateStatus<UpdateMeta, UpdateMetaProgress, String>>, update_status_sender: broadcast::Sender<
UpdateStatus<UpdateMeta, UpdateMetaProgress, String>,
>,
update_method: Option<String>, update_method: Option<String>,
update_format: UpdateFormat, update_format: UpdateFormat,
encoding: Option<String>, encoding: Option<String>,
mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin, mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin,
) -> Result<impl warp::Reply, warp::Rejection> ) -> Result<impl warp::Reply, warp::Rejection> {
{
let file = tokio::task::block_in_place(tempfile::tempfile).unwrap(); let file = tokio::task::block_in_place(tempfile::tempfile).unwrap();
let mut file = TFile::from_std(file); let mut file = TFile::from_std(file);
@ -869,9 +887,8 @@ async fn main() -> anyhow::Result<()> {
let update_store_cloned = update_store.clone(); let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone(); let update_status_sender_cloned = update_status_sender.clone();
let clearing_route = warp::filters::method::post() let clearing_route =
.and(warp::path!("clear-documents")) warp::filters::method::post().and(warp::path!("clear-documents")).map(move || {
.map(move || {
let meta = UpdateMeta::ClearDocuments; let meta = UpdateMeta::ClearDocuments;
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
@ -919,9 +936,8 @@ async fn main() -> anyhow::Result<()> {
let update_store_cloned = update_store.clone(); let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone(); let update_status_sender_cloned = update_status_sender.clone();
let abort_pending_updates_route = warp::filters::method::delete() let abort_pending_updates_route =
.and(warp::path!("updates")) warp::filters::method::delete().and(warp::path!("updates")).map(move || {
.map(move || {
let updates = update_store_cloned.abort_pendings().unwrap(); let updates = update_store_cloned.abort_pendings().unwrap();
for (update_id, meta) in updates { for (update_id, meta) in updates {
let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta });
@ -930,17 +946,15 @@ async fn main() -> anyhow::Result<()> {
warp::reply() warp::reply()
}); });
let update_ws_route = warp::ws() let update_ws_route =
.and(warp::path!("updates" / "ws")) warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| {
.map(move |ws: warp::ws::Ws| {
// And then our closure will be called when it completes... // And then our closure will be called when it completes...
let update_status_receiver = update_status_sender.subscribe(); let update_status_receiver = update_status_sender.subscribe();
ws.on_upgrade(|websocket| { ws.on_upgrade(|websocket| {
// Just echo all updates messages... // Just echo all updates messages...
update_status_receiver update_status_receiver
.into_stream() .into_stream()
.flat_map(|result| { .flat_map(|result| match result {
match result {
Ok(status) => { Ok(status) => {
let msg = serde_json::to_string(&status).unwrap(); let msg = serde_json::to_string(&status).unwrap();
stream::iter(Some(Ok(Message::text(msg)))) stream::iter(Some(Ok(Message::text(msg))))
@ -949,7 +963,6 @@ async fn main() -> anyhow::Result<()> {
eprintln!("channel error: {:?}", e); eprintln!("channel error: {:?}", e);
stream::iter(None) stream::iter(None)
} }
}
}) })
.forward(websocket) .forward(websocket)
.map(|result| { .map(|result| {
@ -989,9 +1002,8 @@ async fn main() -> anyhow::Result<()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use maplit::{btreeset, hashmap, hashset}; use maplit::{btreeset, hashmap, hashset};
use serde_test::{assert_tokens, Token};
use milli::update::Setting; use milli::update::Setting;
use serde_test::{assert_tokens, Token};
use crate::Settings; use crate::Settings;
@ -1003,10 +1015,12 @@ mod tests {
filterable_attributes: Setting::Set(hashset! { "age".to_string() }), filterable_attributes: Setting::Set(hashset! { "age".to_string() }),
criteria: Setting::Set(vec!["asc(age)".to_string()]), criteria: Setting::Set(vec!["asc(age)".to_string()]),
stop_words: Setting::Set(btreeset! { "and".to_string() }), stop_words: Setting::Set(btreeset! { "and".to_string() }),
synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }),
}; };
assert_tokens(&settings, &[ assert_tokens(
&settings,
&[
Token::Struct { name: "Settings", len: 6 }, Token::Struct { name: "Settings", len: 6 },
Token::Str("displayedAttributes"), Token::Str("displayedAttributes"),
Token::Some, Token::Some,
@ -1043,7 +1057,8 @@ mod tests {
Token::SeqEnd, Token::SeqEnd,
Token::MapEnd, Token::MapEnd,
Token::StructEnd, Token::StructEnd,
]); ],
);
} }
#[test] #[test]
@ -1057,7 +1072,9 @@ mod tests {
synonyms: Setting::Reset, synonyms: Setting::Reset,
}; };
assert_tokens(&settings, &[ assert_tokens(
&settings,
&[
Token::Struct { name: "Settings", len: 6 }, Token::Struct { name: "Settings", len: 6 },
Token::Str("displayedAttributes"), Token::Str("displayedAttributes"),
Token::None, Token::None,
@ -1072,7 +1089,8 @@ mod tests {
Token::Str("synonyms"), Token::Str("synonyms"),
Token::None, Token::None,
Token::StructEnd, Token::StructEnd,
]); ],
);
} }
#[test] #[test]
@ -1086,9 +1104,6 @@ mod tests {
synonyms: Setting::NotSet, synonyms: Setting::NotSet,
}; };
assert_tokens(&settings, &[ assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]);
Token::Struct { name: "Settings", len: 0 },
Token::StructEnd,
]);
} }
} }

View File

@ -4,9 +4,9 @@ use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice}; use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
use heed::{EnvOpenOptions, Env, Database}; use heed::{Database, Env, EnvOpenOptions};
use serde::{Serialize, Deserialize}; use serde::{Deserialize, Serialize};
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
@ -25,7 +25,9 @@ pub trait UpdateHandler<M, N> {
} }
impl<M, N, F> UpdateHandler<M, N> for F impl<M, N, F> UpdateHandler<M, N> for F
where F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static { where
F: FnMut(u64, M, &[u8]) -> heed::Result<N> + Send + 'static,
{
fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result<N> { fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result<N> {
self(update_id, meta, content) self(update_id, meta, content)
} }
@ -82,26 +84,17 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// Returns the new biggest id to use to store the new update. /// Returns the new biggest id to use to store the new update.
fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result<u64> { fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result<u64> {
let last_pending = self.pending_meta let last_pending =
.remap_data_type::<DecodeIgnore>() self.pending_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
.last(txn)?
.map(|(k, _)| k.get());
let last_processed = self.processed_meta let last_processed =
.remap_data_type::<DecodeIgnore>() self.processed_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
.last(txn)?
.map(|(k, _)| k.get());
let last_aborted = self.aborted_meta let last_aborted =
.remap_data_type::<DecodeIgnore>() self.aborted_meta.remap_data_type::<DecodeIgnore>().last(txn)?.map(|(k, _)| k.get());
.last(txn)?
.map(|(k, _)| k.get());
let last_update_id = [last_pending, last_processed, last_aborted] let last_update_id =
.iter() [last_pending, last_processed, last_aborted].iter().copied().flatten().max();
.copied()
.flatten()
.max();
match last_update_id { match last_update_id {
Some(last_id) => Ok(last_id + 1), Some(last_id) => Ok(last_id + 1),
@ -112,7 +105,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// Registers the update content in the pending store and the meta /// Registers the update content in the pending store and the meta
/// into the pending-meta store. Returns the new unique update id. /// into the pending-meta store. Returns the new unique update id.
pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result<u64> pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result<u64>
where M: Serialize, where
M: Serialize,
{ {
let mut wtxn = self.env.write_txn()?; let mut wtxn = self.env.write_txn()?;
@ -152,9 +146,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
// a reader while processing it, not a writer. // a reader while processing it, not a writer.
match first_meta { match first_meta {
Some((first_id, first_meta)) => { Some((first_id, first_meta)) => {
let first_content = self.pending let first_content =
.get(&rtxn, &first_id)? self.pending.get(&rtxn, &first_id)?.expect("associated update content");
.expect("associated update content");
// Process the pending update using the provided user function. // Process the pending update using the provided user function.
let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?; let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?;
@ -170,15 +163,16 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
wtxn.commit()?; wtxn.commit()?;
Ok(Some((first_id.get(), new_meta))) Ok(Some((first_id.get(), new_meta)))
}, }
None => Ok(None) None => Ok(None),
} }
} }
/// The id and metadata of the update that is currently being processed, /// The id and metadata of the update that is currently being processed,
/// `None` if no update is being processed. /// `None` if no update is being processed.
pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>> pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>>
where M: for<'a> Deserialize<'a>, where
M: for<'a> Deserialize<'a>,
{ {
let rtxn = self.env.read_txn()?; let rtxn = self.env.read_txn()?;
match self.pending_meta.first(&rtxn)? { match self.pending_meta.first(&rtxn)? {
@ -242,7 +236,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// that as already been processed or which doesn't actually exist, will /// that as already been processed or which doesn't actually exist, will
/// return `None`. /// return `None`.
pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>> pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>>
where M: Serialize + for<'a> Deserialize<'a>, where
M: Serialize + for<'a> Deserialize<'a>,
{ {
let mut wtxn = self.env.write_txn()?; let mut wtxn = self.env.write_txn()?;
let key = BEU64::new(update_id); let key = BEU64::new(update_id);
@ -269,7 +264,8 @@ impl<M: 'static, N: 'static> UpdateStore<M, N> {
/// Aborts all the pending updates, and not the one being currently processed. /// Aborts all the pending updates, and not the one being currently processed.
/// Returns the update metas and ids that were successfully aborted. /// Returns the update metas and ids that were successfully aborted.
pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>> pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>>
where M: Serialize + for<'a> Deserialize<'a>, where
M: Serialize + for<'a> Deserialize<'a>,
{ {
let mut wtxn = self.env.write_txn()?; let mut wtxn = self.env.write_txn()?;
let mut aborted_updates = Vec::new(); let mut aborted_updates = Vec::new();
@ -303,17 +299,19 @@ pub enum UpdateStatusMeta<M, N> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use std::thread; use std::thread;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
use super::*;
#[test] #[test]
fn simple() { fn simple() {
let dir = tempfile::tempdir().unwrap(); let dir = tempfile::tempdir().unwrap();
let options = EnvOpenOptions::new(); let options = EnvOpenOptions::new();
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
Ok(meta + " processed") Ok(meta + " processed")
}).unwrap(); })
.unwrap();
let meta = String::from("kiki"); let meta = String::from("kiki");
let update_id = update_store.register_update(&meta, &[]).unwrap(); let update_id = update_store.register_update(&meta, &[]).unwrap();
@ -332,7 +330,8 @@ mod tests {
let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
thread::sleep(Duration::from_millis(400)); thread::sleep(Duration::from_millis(400));
Ok(meta + " processed") Ok(meta + " processed")
}).unwrap(); })
.unwrap();
let before_register = Instant::now(); let before_register = Instant::now();

View File

@ -1,16 +1,14 @@
use std::fmt::Write as _; use std::fmt::Write as _;
use std::path::PathBuf; use std::path::PathBuf;
use std::{str, io, fmt}; use std::{fmt, io, str};
use anyhow::Context; use anyhow::Context;
use byte_unit::Byte; use byte_unit::Byte;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use structopt::StructOpt;
use milli::facet::FacetType; use milli::facet::FacetType;
use milli::index::db_name::*; use milli::index::db_name::*;
use milli::{Index, TreeLevel}; use milli::{Index, TreeLevel};
use structopt::StructOpt;
use Command::*; use Command::*;
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
@ -257,53 +255,55 @@ fn main() -> anyhow::Result<()> {
WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words),
WordsPrefixesDocids { full_display, prefixes } => { WordsPrefixesDocids { full_display, prefixes } => {
words_prefixes_docids(&index, &rtxn, !full_display, prefixes) words_prefixes_docids(&index, &rtxn, !full_display, prefixes)
}, }
FacetNumbersDocids { full_display, field_name } => { FacetNumbersDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name)
}, }
FacetStringsDocids { full_display, field_name } => { FacetStringsDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name)
}, }
WordsLevelPositionsDocids { full_display, words } => { WordsLevelPositionsDocids { full_display, words } => {
words_level_positions_docids(&index, &rtxn, !full_display, words) words_level_positions_docids(&index, &rtxn, !full_display, words)
}, }
WordPrefixesLevelPositionsDocids { full_display, prefixes } => { WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
}, }
FieldIdWordCountDocids { full_display, field_name } => { FieldIdWordCountDocids { full_display, field_name } => {
field_id_word_count_docids(&index, &rtxn, !full_display, field_name) field_id_word_count_docids(&index, &rtxn, !full_display, field_name)
}, }
DocidsWordsPositions { full_display, internal_documents_ids } => { DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
}, }
FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositionsByWord => { AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn),
average_number_of_positions_by_word(&index, &rtxn)
},
SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases),
DatabaseStats { database } => database_stats(&index, &rtxn, &database), DatabaseStats { database } => database_stats(&index, &rtxn, &database),
WordPairProximitiesDocids { full_display, word1, word2 } => { WordPairProximitiesDocids { full_display, word1, word2 } => {
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
}, }
ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsFst => export_words_fst(&index, &rtxn),
ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn),
ExportDocuments { internal_documents_ids } => { ExportDocuments { internal_documents_ids } => {
export_documents(&index, &rtxn, internal_documents_ids) export_documents(&index, &rtxn, internal_documents_ids)
}, }
} }
} }
fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
use std::collections::BinaryHeap;
use std::cmp::Reverse; use std::cmp::Reverse;
use std::collections::BinaryHeap;
let mut heap = BinaryHeap::with_capacity(limit + 1); let mut heap = BinaryHeap::with_capacity(limit + 1);
for result in index.word_docids.iter(rtxn)? { for result in index.word_docids.iter(rtxn)? {
if limit == 0 { break } if limit == 0 {
break;
}
let (word, docids) = result?; let (word, docids) = result?;
heap.push((Reverse(docids.len()), word)); heap.push((Reverse(docids.len()), word));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
let stdout = io::stdout(); let stdout = io::stdout();
@ -347,7 +347,8 @@ fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) ->
fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
use std::cmp::Reverse; use std::cmp::Reverse;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use heed::types::{Str, ByteSlice};
use heed::types::{ByteSlice, Str};
let Index { let Index {
env: _env, env: _env,
@ -387,71 +388,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let words_fst = index.words_fst(rtxn)?; let words_fst = index.words_fst(rtxn)?;
let length = words_fst.as_fst().as_bytes().len(); let length = words_fst.as_fst().as_bytes().len();
heap.push(Reverse((length, format!("words-fst"), main_name))); heap.push(Reverse((length, format!("words-fst"), main_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
// Fetch the word prefix FST // Fetch the word prefix FST
let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; let words_prefixes_fst = index.words_prefixes_fst(rtxn)?;
let length = words_prefixes_fst.as_fst().as_bytes().len(); let length = words_prefixes_fst.as_fst().as_bytes().len();
heap.push(Reverse((length, format!("words-prefixes-fst"), main_name))); heap.push(Reverse((length, format!("words-prefixes-fst"), main_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (word, value) = result?; let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (word, value) = result?; let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((docid, word), value) = result?; let ((docid, word), value) = result?;
let key = format!("{} {}", docid, word); let key = format!("{} {}", docid, word);
heap.push(Reverse((value.len(), key, docid_word_positions_name))); heap.push(Reverse((value.len(), key, docid_word_positions_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word1, word2, prox), value) = result?; let ((word1, word2, prox), value) = result?;
let key = format!("{} {} {}", word1, word2, prox); let key = format!("{} {} {}", word1, word2, prox);
heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, prefix, prox), value) = result?; let ((word, prefix, prox), value) = result?;
let key = format!("{} {} {}", word, prefix, prox); let key = format!("{} {} {}", word, prefix, prox);
heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?; let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right); let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_level_position_docids_name))); heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?; let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right); let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((field_id, word_count), docids) = result?; let ((field_id, word_count), docids) = result?;
let key = format!("{} {}", field_id, word_count); let key = format!("{} {}", field_id, word_count);
heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
let faceted_fields = index.faceted_fields_ids(rtxn)?; let faceted_fields = index.faceted_fields_ids(rtxn)?;
@ -468,7 +491,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
write!(&mut output, " (level {})", level)?; write!(&mut output, " (level {})", level)?;
let key = format!("{} {}", facet_name, output); let key = format!("{} {}", facet_name, output);
heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); heap.push(Reverse((value.len(), key, facet_id_f64_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
// List the facet strings of this facet id. // List the facet strings of this facet id.
@ -477,14 +502,18 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let ((_fid, fvalue), value) = result?; let ((_fid, fvalue), value) = result?;
let key = format!("{} {}", facet_name, fvalue); let key = format!("{} {}", facet_name, fvalue);
heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); heap.push(Reverse((value.len(), key, facet_id_string_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
} }
for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (id, value) = result?; let (id, value) = result?;
heap.push(Reverse((value.len(), id.to_string(), documents_name))); heap.push(Reverse((value.len(), id.to_string(), documents_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit {
heap.pop();
}
} }
} }
@ -499,7 +528,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
Ok(wtr.flush()?) Ok(wtr.flush()?)
} }
fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> { fn words_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
words: Vec<String>,
) -> anyhow::Result<()> {
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word", "documents_ids"])?; wtr.write_record(&["word", "documents_ids"])?;
@ -523,8 +557,7 @@ fn words_prefixes_docids(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
debug: bool, debug: bool,
prefixes: Vec<String>, prefixes: Vec<String>,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["prefix", "documents_ids"])?; wtr.write_record(&["prefix", "documents_ids"])?;
@ -561,12 +594,12 @@ fn facet_values_docids(
debug: bool, debug: bool,
facet_type: FacetType, facet_type: FacetType,
field_name: String, field_name: String,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name) let field_id = fields_ids_map
.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?; .with_context(|| format!("field {} not found", field_name))?;
if !faceted_fields.contains(&field_id) { if !faceted_fields.contains(&field_id) {
@ -590,7 +623,7 @@ fn facet_values_docids(
}; };
wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?;
} }
}, }
FacetType::String => { FacetType::String => {
wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
@ -614,8 +647,7 @@ fn words_level_positions_docids(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
debug: bool, debug: bool,
words: Vec<String>, words: Vec<String>,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
@ -653,8 +685,7 @@ fn word_prefixes_level_positions_docids(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
debug: bool, debug: bool,
prefixes: Vec<String>, prefixes: Vec<String>,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
@ -691,21 +722,20 @@ fn field_id_word_count_docids(
index: &Index, index: &Index,
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
debug: bool, debug: bool,
field_name: String field_name: String,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["field_name", "word_count", "docids"])?; wtr.write_record(&["field_name", "word_count", "docids"])?;
let field_id = index.fields_ids_map(rtxn)? let field_id = index
.fields_ids_map(rtxn)?
.id(&field_name) .id(&field_name)
.with_context(|| format!("unknown field name: {}", &field_name))?; .with_context(|| format!("unknown field name: {}", &field_name))?;
let left = (field_id, 0); let left = (field_id, 0);
let right = (field_id, u8::max_value()); let right = (field_id, u8::max_value());
let iter = index.field_id_word_count_docids let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?;
.range(rtxn, &(left..=right))?;
for result in iter { for result in iter {
let ((_, word_count), docids) = result?; let ((_, word_count), docids) = result?;
@ -725,8 +755,7 @@ fn docids_words_positions(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
debug: bool, debug: bool,
internal_ids: Vec<u32>, internal_ids: Vec<u32>,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["document_id", "word", "positions"])?; wtr.write_record(&["document_id", "word", "positions"])?;
@ -734,9 +763,10 @@ fn docids_words_positions(
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
Box::new(index.docid_word_positions.iter(rtxn)?) Box::new(index.docid_word_positions.iter(rtxn)?)
} else { } else {
let vec: heed::Result<Vec<_>> = internal_ids.into_iter().map(|id| { let vec: heed::Result<Vec<_>> = internal_ids
index.docid_word_positions.prefix_iter(rtxn, &(id, "")) .into_iter()
}).collect(); .map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, "")))
.collect();
Box::new(vec?.into_iter().flatten()) Box::new(vec?.into_iter().flatten())
}; };
@ -757,7 +787,8 @@ fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) ->
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name) let field_id = fields_ids_map
.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?; .with_context(|| format!("field {} not found", field_name))?;
if !faceted_fields.contains(&field_id) { if !faceted_fields.contains(&field_id) {
@ -808,9 +839,14 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<
Ok(()) Ok(())
} }
fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -> anyhow::Result<()> { fn export_documents(
index: &Index,
rtxn: &heed::RoTxn,
internal_ids: Vec<u32>,
) -> anyhow::Result<()> {
use std::io::{BufWriter, Write as _}; use std::io::{BufWriter, Write as _};
use milli::{BEU32, obkv_to_json};
use milli::{obkv_to_json, BEU32};
let stdout = io::stdout(); let stdout = io::stdout();
let mut out = BufWriter::new(stdout); let mut out = BufWriter::new(stdout);
@ -819,13 +855,13 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect();
let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
Box::new(index.documents.iter(rtxn)?.map(|result| { Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv)))
result.map(|(_id, obkv)| obkv)
}))
} else { } else {
Box::new(internal_ids.into_iter().flat_map(|id| { Box::new(
index.documents.get(rtxn, &BEU32::new(id)).transpose() internal_ids
})) .into_iter()
.flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()),
)
}; };
for result in iter { for result in iter {
@ -842,26 +878,27 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
use heed::types::DecodeIgnore; use heed::types::DecodeIgnore;
use milli::{DocumentId, BEU32StrCodec}; use milli::{BEU32StrCodec, DocumentId};
let mut words_counts = Vec::new(); let mut words_counts = Vec::new();
let mut count = 0; let mut count = 0;
let mut prev = None as Option<(DocumentId, u32)>; let mut prev = None as Option<(DocumentId, u32)>;
let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; let iter =
index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
for result in iter { for result in iter {
let ((docid, _word), ()) = result?; let ((docid, _word), ()) = result?;
match prev.as_mut() { match prev.as_mut() {
Some((prev_docid, prev_count)) if docid == *prev_docid => { Some((prev_docid, prev_count)) if docid == *prev_docid => {
*prev_count += 1; *prev_count += 1;
}, }
Some((prev_docid, prev_count)) => { Some((prev_docid, prev_count)) => {
words_counts.push(*prev_count); words_counts.push(*prev_count);
*prev_docid = docid; *prev_docid = docid;
*prev_count = 0; *prev_count = 0;
count += 1; count += 1;
}, }
None => prev = Some((docid, 1)), None => prev = Some((docid, 1)),
} }
} }
@ -970,16 +1007,15 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> {
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::{Error, BytesDecode}; use heed::{BytesDecode, Error};
use roaring::RoaringBitmap;
use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
use roaring::RoaringBitmap;
fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>(
db: heed::PolyDatabase, db: heed::PolyDatabase,
rtxn: &'a heed::RoTxn, rtxn: &'a heed::RoTxn,
name: &str, name: &str,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
let mut key_size = 0u64; let mut key_size = 0u64;
let mut val_size = 0u64; let mut val_size = 0u64;
let mut values_length = Vec::new(); let mut values_length = Vec::new();
@ -1028,27 +1064,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
WORD_DOCIDS => { WORD_DOCIDS => {
let db = index.word_docids.as_polymorph(); let db = index.word_docids.as_polymorph();
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name) compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
}, }
WORD_PREFIX_DOCIDS => { WORD_PREFIX_DOCIDS => {
let db = index.word_prefix_docids.as_polymorph(); let db = index.word_prefix_docids.as_polymorph();
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name) compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
}, }
DOCID_WORD_POSITIONS => { DOCID_WORD_POSITIONS => {
let db = index.docid_word_positions.as_polymorph(); let db = index.docid_word_positions.as_polymorph();
compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name) compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name)
}, }
WORD_PAIR_PROXIMITY_DOCIDS => { WORD_PAIR_PROXIMITY_DOCIDS => {
let db = index.word_pair_proximity_docids.as_polymorph(); let db = index.word_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
}, }
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => {
let db = index.word_prefix_pair_proximity_docids.as_polymorph(); let db = index.word_prefix_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
}, }
FIELD_ID_WORD_COUNT_DOCIDS => { FIELD_ID_WORD_COUNT_DOCIDS => {
let db = index.field_id_word_count_docids.as_polymorph(); let db = index.field_id_word_count_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
}, }
unknown => anyhow::bail!("unknown database {:?}", unknown), unknown => anyhow::bail!("unknown database {:?}", unknown),
} }
} }
@ -1059,8 +1095,7 @@ fn word_pair_proximities_docids(
debug: bool, debug: bool,
word1: String, word1: String,
word2: String, word2: String,
) -> anyhow::Result<()> ) -> anyhow::Result<()> {
{
use heed::types::ByteSlice; use heed::types::ByteSlice;
use milli::RoaringBitmapCodec; use milli::RoaringBitmapCodec;
@ -1081,7 +1116,9 @@ fn word_pair_proximities_docids(
// Skip keys that are longer than the requested one, // Skip keys that are longer than the requested one,
// a longer key means that the second word is a prefix of the request word. // a longer key means that the second word is a prefix of the request word.
if key.len() != prefix.len() + 1 { continue; } if key.len() != prefix.len() + 1 {
continue;
}
let proximity = key.last().unwrap(); let proximity = key.last().unwrap();
let docids = if debug { let docids = if debug {

View File

@ -1,15 +1,14 @@
use std::fmt; use std::fmt;
use std::str::FromStr; use std::str::FromStr;
use regex::Regex;
use serde::{Serialize, Deserialize};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
use crate::error::{Error, UserError}; use crate::error::{Error, UserError};
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| { static ASC_DESC_REGEX: Lazy<Regex> =
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap());
});
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub enum Criterion { pub enum Criterion {
@ -52,17 +51,21 @@ impl FromStr for Criterion {
"attribute" => Ok(Criterion::Attribute), "attribute" => Ok(Criterion::Attribute),
"exactness" => Ok(Criterion::Exactness), "exactness" => Ok(Criterion::Exactness),
text => { text => {
let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| { let caps = ASC_DESC_REGEX
UserError::InvalidCriterionName { name: text.to_string() } .captures(text)
})?; .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?;
let order = caps.get(1).unwrap().as_str(); let order = caps.get(1).unwrap().as_str();
let field_name = caps.get(2).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str();
match order { match order {
"asc" => Ok(Criterion::Asc(field_name.to_string())), "asc" => Ok(Criterion::Asc(field_name.to_string())),
"desc" => Ok(Criterion::Desc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())),
text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), text => {
return Err(
UserError::InvalidCriterionName { name: text.to_string() }.into()
)
}
}
} }
},
} }
} }
} }

View File

@ -2,7 +2,7 @@ use std::convert::Infallible;
use std::error::Error as StdError; use std::error::Error as StdError;
use std::{fmt, io, str}; use std::{fmt, io, str};
use heed::{MdbError, Error as HeedError}; use heed::{Error as HeedError, MdbError};
use rayon::ThreadPoolBuildError; use rayon::ThreadPoolBuildError;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
@ -80,14 +80,17 @@ impl From<fst::Error> for Error {
} }
} }
impl<E> From<grenad::Error<E>> for Error where Error: From<E> { impl<E> From<grenad::Error<E>> for Error
where
Error: From<E>,
{
fn from(error: grenad::Error<E>) -> Error { fn from(error: grenad::Error<E>) -> Error {
match error { match error {
grenad::Error::Io(error) => Error::IoError(error), grenad::Error::Io(error) => Error::IoError(error),
grenad::Error::Merge(error) => Error::from(error), grenad::Error::Merge(error) => Error::from(error),
grenad::Error::InvalidCompressionType => { grenad::Error::InvalidCompressionType => {
Error::InternalError(InternalError::GrenadInvalidCompressionType) Error::InternalError(InternalError::GrenadInvalidCompressionType)
}, }
} }
} }
} }
@ -171,15 +174,15 @@ impl fmt::Display for InternalError {
match self { match self {
Self::DatabaseMissingEntry { db_name, key } => { Self::DatabaseMissingEntry { db_name, key } => {
write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
}, }
Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::FieldIdMapMissingEntry(error) => error.fmt(f),
Self::Fst(error) => error.fmt(f), Self::Fst(error) => error.fmt(f),
Self::GrenadInvalidCompressionType => { Self::GrenadInvalidCompressionType => {
f.write_str("invalid compression type have been specified to grenad") f.write_str("invalid compression type have been specified to grenad")
}, }
Self::IndexingMergingKeys { process } => { Self::IndexingMergingKeys { process } => {
write!(f, "invalid merge while processing {}", process) write!(f, "invalid merge while processing {}", process)
}, }
Self::Serialization(error) => error.fmt(f), Self::Serialization(error) => error.fmt(f),
Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
Self::RayonThreadPool(error) => error.fmt(f), Self::RayonThreadPool(error) => error.fmt(f),
@ -204,12 +207,12 @@ impl fmt::Display for UserError {
Self::InvalidDocumentId { document_id } => { Self::InvalidDocumentId { document_id } => {
let json = serde_json::to_string(document_id).unwrap(); let json = serde_json::to_string(document_id).unwrap();
write!(f, "document identifier is invalid {}", json) write!(f, "document identifier is invalid {}", json)
}, }
Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidFilterAttribute(error) => error.fmt(f),
Self::MissingDocumentId { document } => { Self::MissingDocumentId { document } => {
let json = serde_json::to_string(document).unwrap(); let json = serde_json::to_string(document).unwrap();
write!(f, "document doesn't have an identifier {}", json) write!(f, "document doesn't have an identifier {}", json)
}, }
Self::MissingPrimaryKey => f.write_str("missing primary key"), Self::MissingPrimaryKey => f.write_str("missing primary key"),
Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
// TODO where can we find it instead of writing the text ourselves? // TODO where can we find it instead of writing the text ourselves?
@ -217,14 +220,14 @@ impl fmt::Display for UserError {
Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
Self::PrimaryKeyCannotBeChanged => { Self::PrimaryKeyCannotBeChanged => {
f.write_str("primary key cannot be changed if the database contains documents") f.write_str("primary key cannot be changed if the database contains documents")
}, }
Self::PrimaryKeyCannotBeReset => { Self::PrimaryKeyCannotBeReset => {
f.write_str("primary key cannot be reset if the database contains documents") f.write_str("primary key cannot be reset if the database contains documents")
}, }
Self::SerdeJson(error) => error.fmt(f), Self::SerdeJson(error) => error.fmt(f),
Self::UnknownInternalDocumentId { document_id } => { Self::UnknownInternalDocumentId { document_id } => {
write!(f, "an unknown internal document id have been used ({})", document_id) write!(f, "an unknown internal document id have been used ({})", document_id)
}, }
} }
} }
} }
@ -236,10 +239,10 @@ impl fmt::Display for FieldIdMapMissingEntry {
match self { match self {
Self::FieldId { field_id, process } => { Self::FieldId { field_id, process } => {
write!(f, "unknown field id {} coming from the {} process", field_id, process) write!(f, "unknown field id {} coming from the {} process", field_id, process)
}, }
Self::FieldName { field_name, process } => { Self::FieldName { field_name, process } => {
write!(f, "unknown field name {} coming from the {} process", field_name, process) write!(f, "unknown field name {} coming from the {} process", field_name, process)
}, }
} }
} }
} }
@ -251,11 +254,11 @@ impl fmt::Display for SerializationError {
match self { match self {
Self::Decoding { db_name: Some(name) } => { Self::Decoding { db_name: Some(name) } => {
write!(f, "decoding from the {} database failed", name) write!(f, "decoding from the {} database failed", name)
}, }
Self::Decoding { db_name: None } => f.write_str("decoding failed"), Self::Decoding { db_name: None } => f.write_str("decoding failed"),
Self::Encoding { db_name: Some(name) } => { Self::Encoding { db_name: Some(name) } => {
write!(f, "encoding into the {} database failed", name) write!(f, "encoding into the {} database failed", name)
}, }
Self::Encoding { db_name: None } => f.write_str("encoding failed"), Self::Encoding { db_name: None } => f.write_str("encoding failed"),
Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"),
} }

View File

@ -1,6 +1,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::convert::TryInto; use std::convert::TryInto;
use fst::{Streamer, IntoStreamer};
use fst::{IntoStreamer, Streamer};
pub struct ExternalDocumentsIds<'a> { pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>, pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
@ -8,7 +9,10 @@ pub struct ExternalDocumentsIds<'a> {
} }
impl<'a> ExternalDocumentsIds<'a> { impl<'a> ExternalDocumentsIds<'a> {
pub fn new(hard: fst::Map<Cow<'a, [u8]>>, soft: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> { pub fn new(
hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>,
) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft } ExternalDocumentsIds { hard, soft }
} }
@ -29,7 +33,7 @@ impl<'a> ExternalDocumentsIds<'a> {
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
// u64 MAX means deleted in the soft fst map // u64 MAX means deleted in the soft fst map
Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), Some(id) if id != u64::MAX => Some(id.try_into().unwrap()),
_otherwise => None _otherwise => None,
} }
} }

View File

@ -2,10 +2,9 @@ use std::error::Error;
use std::fmt; use std::fmt;
use std::str::FromStr; use std::str::FromStr;
use serde::{Serialize, Deserialize}; use serde::{Deserialize, Serialize};
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Serialize, Deserialize)]
pub enum FacetType { pub enum FacetType {
String, String,
Number, Number,

View File

@ -50,7 +50,7 @@ impl Serialize for FacetValue {
FacetValue::Number(number) => { FacetValue::Number(number) => {
let string = number.to_string(); let string = number.to_string();
serializer.serialize_str(&string) serializer.serialize_str(&string)
}, }
} }
} }
} }

View File

@ -28,6 +28,7 @@ fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::cmp::Ordering::Less; use std::cmp::Ordering::Less;
use super::*; use super::*;
fn is_sorted<T: Ord>(x: &[T]) -> bool { fn is_sorted<T: Ord>(x: &[T]) -> bool {

View File

@ -1,5 +1,7 @@
use std::collections::BTreeMap; use std::collections::BTreeMap;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::FieldId; use crate::FieldId;
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -11,11 +13,7 @@ pub struct FieldsIdsMap {
impl FieldsIdsMap { impl FieldsIdsMap {
pub fn new() -> FieldsIdsMap { pub fn new() -> FieldsIdsMap {
FieldsIdsMap { FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) }
names_ids: BTreeMap::new(),
ids_names: BTreeMap::new(),
next_id: Some(0),
}
} }
/// Returns the number of fields ids in the map. /// Returns the number of fields ids in the map.

View File

@ -71,7 +71,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use heed::{BytesEncode, BytesDecode}; use heed::{BytesDecode, BytesEncode};
use super::*; use super::*;
#[test] #[test]

View File

@ -1,8 +1,8 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::convert::TryInto; use std::convert::TryInto;
use crate::{FieldId, DocumentId};
use crate::facet::value_encoding::f64_into_bytes; use crate::facet::value_encoding::f64_into_bytes;
use crate::{DocumentId, FieldId};
pub struct FieldDocIdFacetF64Codec; pub struct FieldDocIdFacetF64Codec;

View File

@ -2,12 +2,17 @@ use std::borrow::Cow;
use std::convert::TryInto; use std::convert::TryInto;
use std::str; use std::str;
use crate::{FieldId, DocumentId}; use crate::{DocumentId, FieldId};
pub struct FieldDocIdFacetStringCodec; pub struct FieldDocIdFacetStringCodec;
impl FieldDocIdFacetStringCodec { impl FieldDocIdFacetStringCodec {
pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec<u8>) { pub fn serialize_into(
field_id: FieldId,
document_id: DocumentId,
value: &str,
out: &mut Vec<u8>,
) {
out.reserve(1 + 4 + value.len()); out.reserve(1 + 4 + value.len());
out.push(field_id); out.push(field_id);
out.extend_from_slice(&document_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes());

View File

@ -1,4 +1,5 @@
use std::{borrow::Cow, convert::TryInto}; use std::borrow::Cow;
use std::convert::TryInto;
use crate::FieldId; use crate::FieldId;

View File

@ -1,16 +1,18 @@
mod beu32_str_codec; mod beu32_str_codec;
pub mod facet;
mod field_id_word_count_codec;
mod obkv_codec; mod obkv_codec;
mod roaring_bitmap; mod roaring_bitmap;
mod roaring_bitmap_length; mod roaring_bitmap_length;
mod str_level_position_codec; mod str_level_position_codec;
mod str_str_u8_codec; mod str_str_u8_codec;
mod field_id_word_count_codec;
pub mod facet;
pub use self::beu32_str_codec::BEU32StrCodec; pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::obkv_codec::ObkvCodec; pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; pub use self::roaring_bitmap_length::{
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
};
pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_level_position_codec::StrLevelPositionCodec;
pub use self::str_str_u8_codec::StrStrU8Codec; pub use self::str_str_u8_codec::StrStrU8Codec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;

View File

@ -1,4 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use obkv::{KvReader, KvWriter}; use obkv::{KvReader, KvWriter};
pub struct ObkvCodec; pub struct ObkvCodec;

View File

@ -75,7 +75,9 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::iter::FromIterator; use std::iter::FromIterator;
use heed::{BytesEncode, BytesDecode};
use heed::{BytesDecode, BytesEncode};
use super::*; use super::*;
#[test] #[test]

View File

@ -1,4 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
pub struct RoaringBitmapCodec; pub struct RoaringBitmapCodec;

View File

@ -1,7 +1,7 @@
use std::io::{self, Read, BufRead}; use std::io::{self, BufRead, Read};
use std::mem; use std::mem;
use byteorder::{ReadBytesExt, LittleEndian}; use byteorder::{LittleEndian, ReadBytesExt};
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347; const SERIAL_COOKIE: u16 = 12347;
@ -16,20 +16,14 @@ impl RoaringBitmapLenCodec {
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(bytes.read_u32::<LittleEndian>()? as usize, true) (bytes.read_u32::<LittleEndian>()? as usize, true)
} else if (cookie as u16) == SERIAL_COOKIE { } else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new( return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
io::ErrorKind::Other,
"run containers are unsupported",
));
} else { } else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
} }
}; };
if size > u16::max_value() as usize + 1 { if size > u16::max_value() as usize + 1 {
return Err(io::Error::new( return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
io::ErrorKind::Other,
"size is greater than supported",
));
} }
let mut description_bytes = vec![0u8; size * 4]; let mut description_bytes = vec![0u8; size * 4];
@ -67,12 +61,12 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use crate::heed_codec::RoaringBitmapCodec;
use heed::BytesEncode; use heed::BytesEncode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::*;
use crate::heed_codec::RoaringBitmapCodec;
#[test] #[test]
fn deserialize_roaring_bitmap_length() { fn deserialize_roaring_bitmap_length() {
let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect(); let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();

View File

@ -13,7 +13,9 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u8>() + size_of::<u32>() * 2; let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
if bytes.len() < footer_len { return None } if bytes.len() < footer_len {
return None;
}
let (word, bytes) = bytes.split_at(bytes.len() - footer_len); let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
let word = str::from_utf8(word).ok()?; let word = str::from_utf8(word).ok()?;

View File

@ -3,23 +3,22 @@ use std::collections::{HashMap, HashSet};
use std::path::Path; use std::path::Path;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
use heed::types::*; use heed::types::*;
use heed::{Database, PolyDatabase, RoTxn, RwTxn};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::{UserError, FieldIdMapMissingEntry, InternalError}; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search};
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result};
use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
FieldIdWordCountCodec,
};
use crate::heed_codec::facet::{
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FacetValueStringCodec, FacetLevelValueF64Codec,
};
use crate::fields_ids_map::FieldsIdsMap; use crate::fields_ids_map::FieldsIdsMap;
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
};
use crate::{
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec,
FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search,
StrLevelPositionCodec, StrStrU8Codec, BEU32,
};
pub mod main_key { pub mod main_key {
pub const CRITERIA_KEY: &str = "criteria"; pub const CRITERIA_KEY: &str = "criteria";
@ -114,14 +113,17 @@ impl Index {
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids =
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?;
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; let word_prefix_level_position_docids =
env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?;
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; let field_id_docid_facet_strings =
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let documents = env.create_database(Some(DOCUMENTS))?; let documents = env.create_database(Some(DOCUMENTS))?;
Index::initialize_creation_dates(&env, main)?; Index::initialize_creation_dates(&env, main)?;
@ -184,18 +186,26 @@ impl Index {
/* documents ids */ /* documents ids */
/// Writes the documents ids that corresponds to the user-ids-documents-ids FST. /// Writes the documents ids that corresponds to the user-ids-documents-ids FST.
pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { pub(crate) fn put_documents_ids(
&self,
wtxn: &mut RwTxn,
docids: &RoaringBitmap,
) -> heed::Result<()> {
self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids)
} }
/// Returns the internal documents ids. /// Returns the internal documents ids.
pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> { pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default()) Ok(self
.main
.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?
.unwrap_or_default())
} }
/// Returns the number of documents indexed in the database. /// Returns the number of documents indexed in the database.
pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result<u64> { pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result<u64> {
let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; let count =
self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?;
Ok(count.unwrap_or_default()) Ok(count.unwrap_or_default())
} }
@ -224,21 +234,30 @@ impl Index {
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>, external_documents_ids: &ExternalDocumentsIds<'a>,
) -> heed::Result<()> ) -> heed::Result<()> {
{
let ExternalDocumentsIds { hard, soft } = external_documents_ids; let ExternalDocumentsIds { hard, soft } = external_documents_ids;
let hard = hard.as_fst().as_bytes(); let hard = hard.as_fst().as_bytes();
let soft = soft.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; self.main.put::<_, Str, ByteSlice>(
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; wtxn,
main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY,
hard,
)?;
self.main.put::<_, Str, ByteSlice>(
wtxn,
main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY,
soft,
)?;
Ok(()) Ok(())
} }
/// Returns the external documents ids map which associate the external ids /// Returns the external documents ids map which associate the external ids
/// with the internal ids (i.e. `u32`). /// with the internal ids (i.e. `u32`).
pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> { pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; let hard =
let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
let soft =
self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
let hard = match hard { let hard = match hard {
Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?,
None => fst::Map::default().map_data(Cow::Owned)?, None => fst::Map::default().map_data(Cow::Owned)?,
@ -254,42 +273,62 @@ impl Index {
/// Writes the fields ids map which associate the documents keys with an internal field id /// Writes the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents. /// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { pub(crate) fn put_fields_ids_map(
&self,
wtxn: &mut RwTxn,
map: &FieldsIdsMap,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map)
} }
/// Returns the fields ids map which associate the documents keys with an internal field id /// Returns the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents. /// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result<FieldsIdsMap> { pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result<FieldsIdsMap> {
Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>( Ok(self
rtxn, .main
main_key::FIELDS_IDS_MAP_KEY, .get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, main_key::FIELDS_IDS_MAP_KEY)?
)?.unwrap_or_default()) .unwrap_or_default())
} }
/* fields distribution */ /* fields distribution */
/// Writes the fields distribution which associates every field name with /// Writes the fields distribution which associates every field name with
/// the number of times it occurs in the documents. /// the number of times it occurs in the documents.
pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { pub(crate) fn put_fields_distribution(
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) &self,
wtxn: &mut RwTxn,
distribution: &FieldsDistribution,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(
wtxn,
main_key::FIELDS_DISTRIBUTION_KEY,
distribution,
)
} }
/// Returns the fields distribution which associates every field name with /// Returns the fields distribution which associates every field name with
/// the number of times it occurs in the documents. /// the number of times it occurs in the documents.
pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>( Ok(self
rtxn, .main
main_key::FIELDS_DISTRIBUTION_KEY, .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)?
)?.unwrap_or_default()) .unwrap_or_default())
} }
/* displayed fields */ /* displayed fields */
/// Writes the fields that must be displayed in the defined order. /// Writes the fields that must be displayed in the defined order.
/// There must be not be any duplicate field id. /// There must be not be any duplicate field id.
pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { pub(crate) fn put_displayed_fields(
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) &self,
wtxn: &mut RwTxn,
fields: &[&str],
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(
wtxn,
main_key::DISPLAYED_FIELDS_KEY,
&fields,
)
} }
/// Deletes the displayed fields ids, this will make the engine to display /// Deletes the displayed fields ids, this will make the engine to display
@ -313,14 +352,17 @@ impl Index {
for name in fields.into_iter() { for name in fields.into_iter() {
match fields_ids_map.id(name) { match fields_ids_map.id(name) {
Some(field_id) => fields_ids.push(field_id), Some(field_id) => fields_ids.push(field_id),
None => return Err(FieldIdMapMissingEntry::FieldName { None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name.to_string(), field_name: name.to_string(),
process: "Index::displayed_fields_ids", process: "Index::displayed_fields_ids",
}.into()), }
.into())
}
} }
} }
Ok(Some(fields_ids)) Ok(Some(fields_ids))
}, }
None => Ok(None), None => Ok(None),
} }
} }
@ -328,8 +370,16 @@ impl Index {
/* searchable fields */ /* searchable fields */
/// Writes the searchable fields, when this list is specified, only these are indexed. /// Writes the searchable fields, when this list is specified, only these are indexed.
pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { pub(crate) fn put_searchable_fields(
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) &self,
wtxn: &mut RwTxn,
fields: &[&str],
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(
wtxn,
main_key::SEARCHABLE_FIELDS_KEY,
&fields,
)
} }
/// Deletes the searchable fields, when no fields are specified, all fields are indexed. /// Deletes the searchable fields, when no fields are specified, all fields are indexed.
@ -352,14 +402,17 @@ impl Index {
for name in fields { for name in fields {
match fields_ids_map.id(name) { match fields_ids_map.id(name) {
Some(field_id) => fields_ids.push(field_id), Some(field_id) => fields_ids.push(field_id),
None => return Err(FieldIdMapMissingEntry::FieldName { None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name.to_string(), field_name: name.to_string(),
process: "Index::searchable_fields_ids", process: "Index::searchable_fields_ids",
}.into()), }
.into())
}
} }
} }
Ok(Some(fields_ids)) Ok(Some(fields_ids))
}, }
None => Ok(None), None => Ok(None),
} }
} }
@ -367,7 +420,11 @@ impl Index {
/* filterable fields */ /* filterable fields */
/// Writes the filterable fields names in the database. /// Writes the filterable fields names in the database.
pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> { pub(crate) fn put_filterable_fields(
&self,
wtxn: &mut RwTxn,
fields: &HashSet<String>,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields)
} }
@ -378,10 +435,10 @@ impl Index {
/// Returns the filterable fields names. /// Returns the filterable fields names.
pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>( Ok(self
rtxn, .main
main_key::FILTERABLE_FIELDS_KEY, .get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)?
)?.unwrap_or_default()) .unwrap_or_default())
} }
/// Identical to `filterable_fields`, but returns ids instead. /// Identical to `filterable_fields`, but returns ids instead.
@ -394,11 +451,14 @@ impl Index {
match fields_ids_map.id(&name) { match fields_ids_map.id(&name) {
Some(field_id) => { Some(field_id) => {
fields_ids.insert(field_id); fields_ids.insert(field_id);
}, }
None => return Err(FieldIdMapMissingEntry::FieldName { None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name, field_name: name,
process: "Index::filterable_fields_ids", process: "Index::filterable_fields_ids",
}.into()), }
.into())
}
} }
} }
@ -413,9 +473,8 @@ impl Index {
pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> {
let filterable_fields = self.filterable_fields(rtxn)?; let filterable_fields = self.filterable_fields(rtxn)?;
let distinct_field = self.distinct_field(rtxn)?; let distinct_field = self.distinct_field(rtxn)?;
let asc_desc_fields = self.criteria(rtxn)? let asc_desc_fields =
.into_iter() self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion {
.filter_map(|criterion| match criterion {
Criterion::Asc(field) | Criterion::Desc(field) => Some(field), Criterion::Asc(field) | Criterion::Desc(field) => Some(field),
_otherwise => None, _otherwise => None,
}); });
@ -439,11 +498,14 @@ impl Index {
match fields_ids_map.id(&name) { match fields_ids_map.id(&name) {
Some(field_id) => { Some(field_id) => {
fields_ids.insert(field_id); fields_ids.insert(field_id);
}, }
None => return Err(FieldIdMapMissingEntry::FieldName { None => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: name, field_name: name,
process: "Index::faceted_fields_ids", process: "Index::faceted_fields_ids",
}.into()), }
.into())
}
} }
} }
@ -458,8 +520,7 @@ impl Index {
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
field_id: FieldId, field_id: FieldId,
docids: &RoaringBitmap, docids: &RoaringBitmap,
) -> heed::Result<()> ) -> heed::Result<()> {
{
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -472,8 +533,7 @@ impl Index {
&self, &self,
rtxn: &RoTxn, rtxn: &RoTxn,
field_id: FieldId, field_id: FieldId,
) -> heed::Result<RoaringBitmap> ) -> heed::Result<RoaringBitmap> {
{
let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -490,8 +550,7 @@ impl Index {
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
field_id: FieldId, field_id: FieldId,
docids: &RoaringBitmap, docids: &RoaringBitmap,
) -> heed::Result<()> ) -> heed::Result<()> {
{
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -504,8 +563,7 @@ impl Index {
&self, &self,
rtxn: &RoTxn, rtxn: &RoTxn,
field_id: FieldId, field_id: FieldId,
) -> heed::Result<RoaringBitmap> ) -> heed::Result<RoaringBitmap> {
{
let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
@ -518,7 +576,11 @@ impl Index {
/* distinct field */ /* distinct field */
pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { pub(crate) fn put_distinct_field(
&self,
wtxn: &mut RwTxn,
distinct_field: &str,
) -> heed::Result<()> {
self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field)
} }
@ -532,7 +594,11 @@ impl Index {
/* criteria */ /* criteria */
pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { pub(crate) fn put_criteria(
&self,
wtxn: &mut RwTxn,
criteria: &[Criterion],
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria)
} }
@ -550,7 +616,11 @@ impl Index {
/* words fst */ /* words fst */
/// Writes the FST which is the words dictionary of the engine. /// Writes the FST which is the words dictionary of the engine.
pub(crate) fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { pub(crate) fn put_words_fst<A: AsRef<[u8]>>(
&self,
wtxn: &mut RwTxn,
fst: &fst::Set<A>,
) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes())
} }
@ -564,7 +634,11 @@ impl Index {
/* stop words */ /* stop words */
pub(crate) fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { pub(crate) fn put_stop_words<A: AsRef<[u8]>>(
&self,
wtxn: &mut RwTxn,
fst: &fst::Set<A>,
) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes())
} }
@ -585,8 +659,7 @@ impl Index {
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>, synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
) -> heed::Result<()> ) -> heed::Result<()> {
{
self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
} }
@ -595,15 +668,17 @@ impl Index {
} }
pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> { pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default()) Ok(self
.main
.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?
.unwrap_or_default())
} }
pub fn words_synonyms<S: AsRef<str>>( pub fn words_synonyms<S: AsRef<str>>(
&self, &self,
rtxn: &RoTxn, rtxn: &RoTxn,
words: &[S], words: &[S],
) -> heed::Result<Option<Vec<Vec<String>>>> ) -> heed::Result<Option<Vec<Vec<String>>>> {
{
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
Ok(self.synonyms(rtxn)?.remove(&words)) Ok(self.synonyms(rtxn)?.remove(&words))
} }
@ -611,8 +686,16 @@ impl Index {
/* words prefixes fst */ /* words prefixes fst */
/// Writes the FST which is the words prefixes dictionnary of the engine. /// Writes the FST which is the words prefixes dictionnary of the engine.
pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) &self,
wtxn: &mut RwTxn,
fst: &fst::Set<A>,
) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(
wtxn,
main_key::WORDS_PREFIXES_FST_KEY,
fst.as_fst().as_bytes(),
)
} }
/// Returns the FST which is the words prefixes dictionnary of the engine. /// Returns the FST which is the words prefixes dictionnary of the engine.
@ -638,12 +721,13 @@ impl Index {
&self, &self,
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>, ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> ) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>> {
{
let mut documents = Vec::new(); let mut documents = Vec::new();
for id in ids { for id in ids {
let kv = self.documents.get(rtxn, &BEU32::new(id))? let kv = self
.documents
.get(rtxn, &BEU32::new(id))?
.ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?;
documents.push((id, kv)); documents.push((id, kv));
} }
@ -673,7 +757,8 @@ impl Index {
/// Returns the index creation time. /// Returns the index creation time.
pub fn created_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> { pub fn created_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> {
Ok(self.main Ok(self
.main
.get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::CREATED_AT_KEY)? .get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::CREATED_AT_KEY)?
.ok_or(InternalError::DatabaseMissingEntry { .ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN, db_name: db_name::MAIN,
@ -683,7 +768,8 @@ impl Index {
/// Returns the index last updated time. /// Returns the index last updated time.
pub fn updated_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> { pub fn updated_at(&self, rtxn: &RoTxn) -> Result<DateTime<Utc>> {
Ok(self.main Ok(self
.main
.get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::UPDATED_AT_KEY)? .get::<_, Str, SerdeJson<DateTime<Utc>>>(rtxn, main_key::UPDATED_AT_KEY)?
.ok_or(InternalError::DatabaseMissingEntry { .ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN, db_name: db_name::MAIN,
@ -691,7 +777,11 @@ impl Index {
})?) })?)
} }
pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime<Utc>) -> heed::Result<()> { pub(crate) fn set_updated_at(
&self,
wtxn: &mut RwTxn,
time: &DateTime<Utc>,
) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time) self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time)
} }
} }
@ -704,8 +794,8 @@ pub(crate) mod tests {
use maplit::hashmap; use maplit::hashmap;
use tempfile::TempDir; use tempfile::TempDir;
use crate::Index;
use crate::update::{IndexDocuments, UpdateFormat}; use crate::update::{IndexDocuments, UpdateFormat};
use crate::Index;
pub(crate) struct TempIndex { pub(crate) struct TempIndex {
inner: Index, inner: Index,
@ -728,10 +818,7 @@ pub(crate) mod tests {
options.map_size(100 * 4096); options.map_size(100 * 4096);
let _tempdir = TempDir::new_in(".").unwrap(); let _tempdir = TempDir::new_in(".").unwrap();
let inner = Index::new(options, _tempdir.path()).unwrap(); let inner = Index::new(options, _tempdir.path()).unwrap();
Self { Self { inner, _tempdir }
inner,
_tempdir
}
} }
} }
@ -756,10 +843,13 @@ pub(crate) mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let fields_distribution = index.fields_distribution(&rtxn).unwrap(); let fields_distribution = index.fields_distribution(&rtxn).unwrap();
assert_eq!(fields_distribution, hashmap! { assert_eq!(
fields_distribution,
hashmap! {
"id".to_string() => 2, "id".to_string() => 2,
"name".to_string() => 2, "name".to_string() => 2,
"age".to_string() => 1, "age".to_string() => 1,
}); }
);
} }
} }

View File

@ -1,14 +1,15 @@
#[macro_use] extern crate pest_derive; #[macro_use]
extern crate pest_derive;
mod criterion; mod criterion;
mod error; mod error;
mod external_documents_ids; mod external_documents_ids;
mod fields_ids_map;
mod search;
pub mod facet; pub mod facet;
mod fields_ids_map;
pub mod heed_codec; pub mod heed_codec;
pub mod index; pub mod index;
pub mod proximity; pub mod proximity;
mod search;
pub mod tree_level; pub mod tree_level;
pub mod update; pub mod update;
@ -20,15 +21,17 @@ use std::result::Result as StdResult;
use fxhash::{FxHasher32, FxHasher64}; use fxhash::{FxHasher32, FxHasher64};
use serde_json::{Map, Value}; use serde_json::{Map, Value};
pub use self::criterion::{Criterion, default_criteria}; pub use self::criterion::{default_criteria, Criterion};
pub use self::error::Error; pub use self::error::Error;
pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap; pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; pub use self::heed_codec::{
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
};
pub use self::index::Index; pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords}; pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult};
pub use self::tree_level::TreeLevel; pub use self::tree_level::TreeLevel;
pub type Result<T> = std::result::Result<T, error::Error>; pub type Result<T> = std::result::Result<T, error::Error>;
@ -54,9 +57,9 @@ pub fn obkv_to_json(
displayed_fields: &[FieldId], displayed_fields: &[FieldId],
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
obkv: obkv::KvReader, obkv: obkv::KvReader,
) -> Result<Map<String, Value>> ) -> Result<Map<String, Value>> {
{ displayed_fields
displayed_fields.iter() .iter()
.copied() .copied()
.flat_map(|id| obkv.get(id).map(|value| (id, value))) .flat_map(|id| obkv.get(id).map(|value| (id, value)))
.map(|(id, value)| { .map(|(id, value)| {
@ -72,7 +75,6 @@ pub fn obkv_to_json(
/// Transform a JSON value into a string that can be indexed. /// Transform a JSON value into a string that can be indexed.
pub fn json_to_string(value: &Value) -> Option<String> { pub fn json_to_string(value: &Value) -> Option<String> {
fn inner(value: &Value, output: &mut String) -> bool { fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write; use std::fmt::Write;
match value { match value {
@ -90,7 +92,7 @@ pub fn json_to_string(value: &Value) -> Option<String> {
} }
// check that at least one value was written // check that at least one value was written
count != 0 count != 0
}, }
Value::Object(object) => { Value::Object(object) => {
let mut buffer = String::new(); let mut buffer = String::new();
let mut count = 0; let mut count = 0;
@ -107,7 +109,7 @@ pub fn json_to_string(value: &Value) -> Option<String> {
} }
// check that at least one value was written // check that at least one value was written
count != 0 count != 0
}, }
} }
} }
@ -121,9 +123,10 @@ pub fn json_to_string(value: &Value) -> Option<String> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use serde_json::json; use serde_json::json;
use super::*;
#[test] #[test]
fn json_to_string_object() { fn json_to_string_object() {
let value = json!({ let value = json!({

View File

@ -1,4 +1,5 @@
use std::cmp; use std::cmp;
use crate::{Attribute, Position}; use crate::{Attribute, Position};
const ONE_ATTRIBUTE: u32 = 1000; const ONE_ATTRIBUTE: u32 = 1000;
@ -15,8 +16,11 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
let (lhs_attr, lhs_index) = extract_position(lhs); let (lhs_attr, lhs_index) = extract_position(lhs);
let (rhs_attr, rhs_index) = extract_position(rhs); let (rhs_attr, rhs_index) = extract_position(rhs);
if lhs_attr != rhs_attr { MAX_DISTANCE } if lhs_attr != rhs_attr {
else { index_proximity(lhs_index, rhs_index) } MAX_DISTANCE
} else {
index_proximity(lhs_index, rhs_index)
}
} }
pub fn extract_position(position: Position) -> (Attribute, Position) { pub fn extract_position(position: Position) -> (Attribute, Position) {

View File

@ -5,12 +5,12 @@ use log::debug;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::error::FieldIdMapMissingEntry; use crate::error::FieldIdMapMissingEntry;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter; use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
use crate::{FieldId, Index, Result}; use crate::{FieldId, Index, Result};
use super::{Criterion, CriterionParameters, CriterionResult};
/// Threshold on the number of candidates that will make /// Threshold on the number of candidates that will make
/// the system to choose between one algorithm or another. /// the system to choose between one algorithm or another.
@ -57,9 +57,8 @@ impl<'t> AscDesc<'t> {
ascending: bool, ascending: bool,
) -> Result<Self> { ) -> Result<Self> {
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let field_id = fields_ids_map let field_id =
.id(&field_name) fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
.ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: field_name.clone(), field_name: field_name.clone(),
process: "AscDesc::new", process: "AscDesc::new",
})?; })?;
@ -101,17 +100,21 @@ impl<'t> Criterion for AscDesc<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
None => { None => match self.parent.next(params)? {
match self.parent.next(params)? { Some(CriterionResult {
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { query_tree,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
self.query_tree = query_tree; self.query_tree = query_tree;
let mut candidates = match (&self.query_tree, candidates) { let mut candidates = match (&self.query_tree, candidates) {
(_, Some(candidates)) => candidates, (_, Some(candidates)) => candidates,
(Some(qt), None) => { (Some(qt), None) => {
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
resolve_query_tree(&context, qt, params.wdcache)? resolve_query_tree(&context, qt, params.wdcache)?
}, }
(None, None) => self.index.documents_ids(self.rtxn)?, (None, None) => self.index.documents_ids(self.rtxn)?,
}; };
@ -136,9 +139,8 @@ impl<'t> Criterion for AscDesc<'t> {
self.ascending, self.ascending,
candidates & &self.faceted_candidates, candidates & &self.faceted_candidates,
)?; )?;
},
None => return Ok(None),
} }
None => return Ok(None),
}, },
Some(mut candidates) => { Some(mut candidates) => {
candidates -= params.excluded_candidates; candidates -= params.excluded_candidates;
@ -170,11 +172,8 @@ fn facet_ordered<'t>(
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else { } else {
let facet_fn = if ascending { let facet_fn =
FacetIter::new_reducing if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing };
} else {
FacetIter::new_reverse_reducing
};
let iter = facet_fn(rtxn, index, field_id, candidates)?; let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
} }
@ -194,9 +193,7 @@ fn iterative_facet_ordered_iter<'t>(
for docid in candidates.iter() { for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN); let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX); let right = (field_id, docid, f64::MAX);
let mut iter = index let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
.field_id_docid_facet_f64s
.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() }; let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? { if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value))); docids_values.push((docid, OrderedFloat(value)));

View File

@ -1,15 +1,16 @@
use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap, btree_map}; use std::cmp::{self, Ordering};
use std::collections::binary_heap::PeekMut; use std::collections::binary_heap::PeekMut;
use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap};
use std::mem::take; use std::mem::take;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::{TreeLevel, Result, search::build_dfa}; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::Query; use crate::search::criteria::Query;
use crate::search::query_tree::{Operation, QueryKind}; use crate::search::query_tree::{Operation, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache}; use crate::search::{build_dfa, word_derivations, WordDerivationsCache};
use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree}; use crate::{Result, TreeLevel};
/// To be able to divide integers by the number of words in the query /// To be able to divide integers by the number of words in the query
/// we want to find a multiplier that allow us to divide by any number between 1 and 10. /// we want to find a multiplier that allow us to divide by any number between 1 and 10.
@ -63,15 +64,19 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD {
let current_buckets = match self.current_buckets.as_mut() { let current_buckets = match self.current_buckets.as_mut() {
Some(current_buckets) => current_buckets, Some(current_buckets) => current_buckets,
None => { None => {
let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; let new_buckets = linear_compute_candidates(
self.ctx,
&flattened_query_tree,
&allowed_candidates,
)?;
self.current_buckets.get_or_insert(new_buckets.into_iter()) self.current_buckets.get_or_insert(new_buckets.into_iter())
}, }
}; };
match current_buckets.next() { match current_buckets.next() {
@ -83,10 +88,15 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
} }
} else { } else {
match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? { match set_compute_candidates(
self.ctx,
&flattened_query_tree,
&allowed_candidates,
params.wdcache,
)? {
Some(candidates) => candidates, Some(candidates) => candidates,
None => { None => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
@ -95,13 +105,14 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
} }
}; };
allowed_candidates -= &found_candidates; allowed_candidates -= &found_candidates;
self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); self.state =
Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: Some(query_tree), query_tree: Some(query_tree),
@ -109,13 +120,20 @@ impl<'t> Criterion for Attribute<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
None => { None => match self.parent.next(params)? {
match self.parent.next(params)? { Some(CriterionResult {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match candidates { let mut candidates = match candidates {
Some(candidates) => candidates, Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
}; };
if let Some(filtered_candidates) = filtered_candidates { if let Some(filtered_candidates) = filtered_candidates {
@ -131,17 +149,21 @@ impl<'t> Criterion for Attribute<'t> {
self.state = Some((query_tree, flattened_query_tree, candidates)); self.state = Some((query_tree, flattened_query_tree, candidates));
self.current_buckets = None; self.current_buckets = None;
}, }
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates, candidates,
filtered_candidates, filtered_candidates,
bucket_candidates, bucket_candidates,
})); }));
},
None => return Ok(None),
} }
None => return Ok(None),
}, },
} }
} }
@ -152,7 +174,9 @@ impl<'t> Criterion for Attribute<'t> {
/// it will begin at the first non-empty interval and will return every interval without /// it will begin at the first non-empty interval and will return every interval without
/// jumping over empty intervals. /// jumping over empty intervals.
struct WordLevelIterator<'t, 'q> { struct WordLevelIterator<'t, 'q> {
inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>, inner: Box<
dyn Iterator<Item = heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't,
>,
level: TreeLevel, level: TreeLevel,
interval_size: u32, interval_size: u32,
word: Cow<'q, str>, word: Cow<'q, str>,
@ -162,50 +186,81 @@ struct WordLevelIterator<'t, 'q> {
} }
impl<'t, 'q> WordLevelIterator<'t, 'q> { impl<'t, 'q> WordLevelIterator<'t, 'q> {
fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> { fn new(
ctx: &'t dyn Context<'t>,
word: Cow<'q, str>,
in_prefix_cache: bool,
) -> heed::Result<Option<Self>> {
match ctx.word_position_last_level(&word, in_prefix_cache)? { match ctx.word_position_last_level(&word, in_prefix_cache)? {
Some(level) => { Some(level) => {
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; let inner =
Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
}, Ok(Some(Self {
inner,
level,
interval_size,
word,
in_prefix_cache,
inner_next: None,
current_interval: None,
}))
}
None => Ok(None), None => Ok(None),
} }
} }
fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> { fn dig(
&self,
ctx: &'t dyn Context<'t>,
level: &TreeLevel,
left_interval: Option<u32>,
) -> heed::Result<Self> {
let level = *level.min(&self.level); let level = *level.min(&self.level);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
let word = self.word.clone(); let word = self.word.clone();
let in_prefix_cache = self.in_prefix_cache; let in_prefix_cache = self.in_prefix_cache;
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; let inner =
ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) Ok(Self {
inner,
level,
interval_size,
word,
in_prefix_cache,
inner_next: None,
current_interval: None,
})
} }
fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } fn is_next_interval(last_right: u32, next_left: u32) -> bool {
last_right + 1 == next_left
}
let inner_next = match self.inner_next.take() { let inner_next = match self.inner_next.take() {
Some(inner_next) => Some(inner_next), Some(inner_next) => Some(inner_next),
None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), None => self
.inner
.next()
.transpose()?
.map(|((_, _, left, right), docids)| (left, right, docids)),
}; };
match inner_next { match inner_next {
Some((left, right, docids)) => { Some((left, right, docids)) => match self.current_interval {
match self.current_interval {
Some((last_left, last_right)) if !is_next_interval(last_right, left) => { Some((last_left, last_right)) if !is_next_interval(last_right, left) => {
let blank_left = last_left + self.interval_size; let blank_left = last_left + self.interval_size;
let blank_right = last_right + self.interval_size; let blank_right = last_right + self.interval_size;
self.current_interval = Some((blank_left, blank_right)); self.current_interval = Some((blank_left, blank_right));
self.inner_next = Some((left, right, docids)); self.inner_next = Some((left, right, docids));
Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) Ok(Some((blank_left, blank_right, RoaringBitmap::new())))
}, }
_ => { _ => {
self.current_interval = Some((left, right)); self.current_interval = Some((left, right));
Ok(Some((left, right, docids))) Ok(Some((left, right, docids)))
} }
}
}, },
None => Ok(None), None => Ok(None),
} }
@ -228,30 +283,37 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
queries: &'q [Query], queries: &'q [Query],
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Option<Self>> ) -> Result<Option<Self>> {
{
let mut inner = Vec::with_capacity(queries.len()); let mut inner = Vec::with_capacity(queries.len());
for query in queries { for query in queries {
match &query.kind { match &query.kind {
QueryKind::Exact { word, .. } => { QueryKind::Exact { word, .. } => {
if !query.prefix || ctx.in_prefix_cache(&word) { if !query.prefix || ctx.in_prefix_cache(&word) {
let word = Cow::Borrowed(query.kind.word()); let word = Cow::Borrowed(query.kind.word());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? { if let Some(word_level_iterator) =
WordLevelIterator::new(ctx, word, query.prefix)?
{
inner.push(word_level_iterator); inner.push(word_level_iterator);
} }
} else { } else {
for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?
{
let word = Cow::Owned(word.to_owned()); let word = Cow::Owned(word.to_owned());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { if let Some(word_level_iterator) =
WordLevelIterator::new(ctx, word, false)?
{
inner.push(word_level_iterator); inner.push(word_level_iterator);
} }
} }
} }
}, }
QueryKind::Tolerant { typo, word } => { QueryKind::Tolerant { typo, word } => {
for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { for (word, _) in
word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?
{
let word = Cow::Owned(word.to_owned()); let word = Cow::Owned(word.to_owned());
if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)?
{
inner.push(word_level_iterator); inner.push(word_level_iterator);
} }
} }
@ -284,17 +346,28 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
Some(parent) => { Some(parent) => {
let parent = parent.dig(ctx)?; let parent = parent.dig(ctx)?;
(parent.level.min(self.level), Some(Box::new(parent))) (parent.level.min(self.level), Some(Box::new(parent)))
}, }
None => (self.level.saturating_sub(1), None), None => (self.level.saturating_sub(1), None),
}; };
let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); let left_interval = self
.accumulator
.get(self.interval_to_skip)
.map(|opt| opt.as_ref().map(|(left, _, _)| *left))
.flatten();
let mut inner = Vec::with_capacity(self.inner.len()); let mut inner = Vec::with_capacity(self.inner.len());
for word_level_iterator in self.inner.iter() { for word_level_iterator in self.inner.iter() {
inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); inner.push(word_level_iterator.dig(ctx, &level, left_interval)?);
} }
Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) Ok(Self {
parent,
inner,
level,
accumulator: vec![],
parent_accumulator: vec![],
interval_to_skip: 0,
})
} }
fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
@ -310,7 +383,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
Some((acc_left, acc_right, mut acc_docids)) => { Some((acc_left, acc_right, mut acc_docids)) => {
acc_docids |= next_docids; acc_docids |= next_docids;
Some((acc_left, acc_right, acc_docids)) Some((acc_left, acc_right, acc_docids))
}, }
None => Some((next_left, next_left + interval_size, next_docids)), None => Some((next_left, next_left + interval_size, next_docids)),
}; };
} }
@ -322,7 +395,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
/// return the next meta-interval created from inner WordLevelIterators, /// return the next meta-interval created from inner WordLevelIterators,
/// and from eventual chainned QueryLevelIterator. /// and from eventual chainned QueryLevelIterator.
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { fn next(
&mut self,
allowed_candidates: &RoaringBitmap,
tree_level: TreeLevel,
) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> {
let parent_result = match self.parent.as_mut() { let parent_result = match self.parent.as_mut() {
Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), Some(parent) => Some(parent.next(allowed_candidates, tree_level)?),
None => None, None => None,
@ -335,22 +412,30 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
&self.parent_accumulator, &self.parent_accumulator,
&self.accumulator, &self.accumulator,
self.interval_to_skip, self.interval_to_skip,
allowed_candidates allowed_candidates,
); );
self.accumulator.push(inner_next); self.accumulator.push(inner_next);
self.parent_accumulator.push(parent_next); self.parent_accumulator.push(parent_next);
let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { for current in self
.accumulator
.iter()
.rev()
.zip(self.parent_accumulator.iter())
.skip(self.interval_to_skip)
{
if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current {
match merged_interval.as_mut() { match merged_interval.as_mut() {
Some((_, _, merged_docids)) => *merged_docids |= a & b, Some((_, _, merged_docids)) => *merged_docids |= a & b,
None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), None => {
merged_interval = Some((left_a + left_b, right_a + right_b, a & b))
}
} }
} }
} }
Ok(merged_interval) Ok(merged_interval)
}, }
None => { None => {
let level = self.level; let level = self.level;
match self.inner_next(level)? { match self.inner_next(level)? {
@ -358,12 +443,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; self.accumulator = vec![Some((left, right, RoaringBitmap::new()))];
candidates &= allowed_candidates; candidates &= allowed_candidates;
Ok(Some((left, right, candidates))) Ok(Some((left, right, candidates)))
}
},
None => { None => {
self.accumulator = vec![None]; self.accumulator = vec![None];
Ok(None) Ok(None)
}, }
} }
} }
} }
@ -379,16 +463,18 @@ fn interval_to_skip(
already_skiped: usize, already_skiped: usize,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
) -> usize { ) -> usize {
parent_accumulator.iter() parent_accumulator
.iter()
.zip(current_accumulator.iter()) .zip(current_accumulator.iter())
.skip(already_skiped) .skip(already_skiped)
.take_while(|(parent, current)| { .take_while(|(parent, current)| {
let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); let skip_current = current
.as_ref()
.map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
skip_parent && skip_current skip_parent && skip_current
}) })
.count() .count()
} }
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree, /// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
@ -410,7 +496,7 @@ impl<'t, 'q> Branch<'t, 'q> {
self.last_result = last_result; self.last_result = last_result;
self.tree_level = tree_level; self.tree_level = tree_level;
Ok(true) Ok(true)
}, }
None => Ok(false), None => Ok(false),
} }
} }
@ -477,7 +563,6 @@ fn initialize_query_level_iterators<'t, 'q>(
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<BinaryHeap<Branch<'t, 'q>>> { ) -> Result<BinaryHeap<Branch<'t, 'q>>> {
let mut positions = BinaryHeap::with_capacity(branches.len()); let mut positions = BinaryHeap::with_capacity(branches.len());
for branch in branches { for branch in branches {
let mut branch_positions = Vec::with_capacity(branch.len()); let mut branch_positions = Vec::with_capacity(branch.len());
@ -488,19 +573,20 @@ fn initialize_query_level_iterators<'t, 'q>(
// the branch seems to be invalid, so we skip it. // the branch seems to be invalid, so we skip it.
branch_positions.clear(); branch_positions.clear();
break; break;
}, }
} }
} }
// QueryLevelIterator need to be sorted by level and folded in descending order. // QueryLevelIterator need to be sorted by level and folded in descending order.
branch_positions.sort_unstable_by_key(|qli| qli.level); branch_positions.sort_unstable_by_key(|qli| qli.level);
let folded_query_level_iterators = branch_positions let folded_query_level_iterators =
.into_iter() branch_positions.into_iter().fold(None, |fold: Option<QueryLevelIterator>, mut qli| {
.fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold { match fold {
Some(fold) => { Some(fold) => {
qli.parent(fold); qli.parent(fold);
Some(qli) Some(qli)
}, }
None => Some(qli), None => Some(qli),
}
}); });
if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { if let Some(mut folded_query_level_iterators) = folded_query_level_iterators {
@ -526,9 +612,9 @@ fn set_compute_candidates<'t>(
branches: &FlattenedQueryTree, branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Option<RoaringBitmap>> ) -> Result<Option<RoaringBitmap>> {
{ let mut branches_heap =
let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?;
let lowest_level = TreeLevel::min_value(); let lowest_level = TreeLevel::min_value();
let mut final_candidates: Option<(u32, RoaringBitmap)> = None; let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
let mut allowed_candidates = allowed_candidates.clone(); let mut allowed_candidates = allowed_candidates.clone();
@ -539,15 +625,18 @@ fn set_compute_candidates<'t>(
// if current is worst than best we break to return // if current is worst than best we break to return
// candidates that correspond to the best rank // candidates that correspond to the best rank
if let Some((best_rank, _)) = final_candidates { if let Some((best_rank, _)) = final_candidates {
if branch_rank > best_rank { break } if branch_rank > best_rank {
break;
}
} }
let _left = branch.last_result.0; let _left = branch.last_result.0;
let candidates = take(&mut branch.last_result.2); let candidates = take(&mut branch.last_result.2);
if candidates.is_empty() { if candidates.is_empty() {
// we don't have candidates, get next interval. // we don't have candidates, get next interval.
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } if !branch.next(&allowed_candidates)? {
PeekMut::pop(branch);
} }
else if is_lowest_level { } else if is_lowest_level {
// we have candidates, but we can't dig deeper. // we have candidates, but we can't dig deeper.
allowed_candidates -= &candidates; allowed_candidates -= &candidates;
final_candidates = match final_candidates.take() { final_candidates = match final_candidates.take() {
@ -556,19 +645,20 @@ fn set_compute_candidates<'t>(
best_candidates |= candidates; best_candidates |= candidates;
branch.lazy_next(); branch.lazy_next();
Some((best_rank, best_candidates)) Some((best_rank, best_candidates))
}, }
// we take current candidates as best candidates // we take current candidates as best candidates
None => { None => {
branch.lazy_next(); branch.lazy_next();
Some((branch_rank, candidates)) Some((branch_rank, candidates))
}, }
}; };
} else { } else {
// we have candidates, lets dig deeper in levels. // we have candidates, lets dig deeper in levels.
branch.dig(ctx)?; branch.dig(ctx)?;
if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } if !branch.next(&allowed_candidates)? {
PeekMut::pop(branch);
}
} }
} }
Ok(final_candidates.map(|(_rank, candidates)| candidates)) Ok(final_candidates.map(|(_rank, candidates)| candidates))
@ -578,9 +668,11 @@ fn linear_compute_candidates(
ctx: &dyn Context, ctx: &dyn Context,
branches: &FlattenedQueryTree, branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
) -> Result<BTreeMap<u64, RoaringBitmap>> ) -> Result<BTreeMap<u64, RoaringBitmap>> {
{ fn compute_candidate_rank(
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 { branches: &FlattenedQueryTree,
words_positions: HashMap<String, RoaringBitmap>,
) -> u64 {
let mut min_rank = u64::max_value(); let mut min_rank = u64::max_value();
for branch in branches { for branch in branches {
let branch_len = branch.len(); let branch_len = branch.len();
@ -593,17 +685,20 @@ fn linear_compute_candidates(
QueryKind::Exact { word, .. } => { QueryKind::Exact { word, .. } => {
if *prefix { if *prefix {
word_derivations(word, true, 0, &words_positions) word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().next()).min() .flat_map(|positions| positions.iter().next())
.min()
} else { } else {
words_positions.get(word) words_positions
.get(word)
.map(|positions| positions.iter().next()) .map(|positions| positions.iter().next())
.flatten() .flatten()
} }
}, }
QueryKind::Tolerant { typo, word } => { QueryKind::Tolerant { typo, word } => {
word_derivations(word, *prefix, *typo, &words_positions) word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().next()).min() .flat_map(|positions| positions.iter().next())
}, .min()
}
}; };
match (position, current_position) { match (position, current_position) {
@ -627,9 +722,11 @@ fn linear_compute_candidates(
branch_rank.sort_unstable(); branch_rank.sort_unstable();
// because several words in same query can't match all a the position 0, // because several words in same query can't match all a the position 0,
// we substract the word index to the position. // we substract the word index to the position.
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); let branch_rank: u64 =
branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
// here we do the means of the words of the branch // here we do the means of the words of the branch
min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); min_rank =
min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
} }
} }
@ -641,8 +738,7 @@ fn linear_compute_candidates(
is_prefix: bool, is_prefix: bool,
max_typo: u8, max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>, words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap> ) -> impl Iterator<Item = &'a RoaringBitmap> {
{
let dfa = build_dfa(word, max_typo, is_prefix); let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| { words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance; use levenshtein_automata::Distance;
@ -680,25 +776,26 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
} }
} }
out out
}, }
None => recurse(head), None => recurse(head),
} }
} }
fn recurse(op: &Operation) -> FlattenedQueryTree { fn recurse(op: &Operation) -> FlattenedQueryTree {
match op { match op {
And(ops) => { And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)),
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) Or(_, ops) => {
}, if ops.iter().all(|op| op.query().is_some()) {
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
} else { } else {
ops.iter().map(recurse).flatten().collect() ops.iter().map(recurse).flatten().collect()
}, }
}
Phrase(words) => { Phrase(words) => {
let queries = words.iter().map(|word| { let queries = words
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}] .iter()
}).collect(); .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect();
vec![queries] vec![queries]
} }
Operation::Query(query) => vec![vec![vec![query.clone()]]], Operation::Query(query) => vec![vec![vec![query.clone()]]],
@ -712,12 +809,14 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
mod tests { mod tests {
use big_s::S; use big_s::S;
use crate::search::criteria::QueryKind;
use super::*; use super::*;
use crate::search::criteria::QueryKind;
#[test] #[test]
fn simple_flatten_query_tree() { fn simple_flatten_query_tree() {
let query_tree = Operation::Or(false, vec![ let query_tree = Operation::Or(
false,
vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
@ -725,15 +824,28 @@ mod tests {
]), ]),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
Operation::Or(false, vec![ Operation::Or(
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("thefish")),
}),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), prefix: false,
kind: QueryKind::exact(S("the")),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("fish")),
}),
]), ]),
],
),
]), ]),
]), ],
]); );
let expected = vec![ let expected = vec![
vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]],

View File

@ -2,19 +2,15 @@ use std::convert::TryFrom;
use std::mem::take; use std::mem::take;
use std::ops::BitOr; use std::ops::BitOr;
use itertools::Itertools;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use itertools::Itertools;
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::search::criteria::{ use crate::search::criteria::{
Context, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
Criterion,
CriterionParameters,
CriterionResult,
resolve_query_tree,
}; };
use crate::{TreeLevel, Result}; use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{Result, TreeLevel};
pub struct Exactness<'t> { pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
@ -26,7 +22,11 @@ pub struct Exactness<'t> {
} }
impl<'t> Exactness<'t> { impl<'t> Exactness<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>, primitive_query: &[PrimitiveQueryPart]) -> heed::Result<Self> { pub fn new(
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
primitive_query: &[PrimitiveQueryPart],
) -> heed::Result<Self> {
let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); let mut query: Vec<_> = Vec::with_capacity(primitive_query.len());
for part in primitive_query { for part in primitive_query {
query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?);
@ -59,7 +59,7 @@ impl<'t> Criterion for Exactness<'t> {
// reset state // reset state
self.state = None; self.state = None;
self.query_tree = None; self.query_tree = None;
}, }
Some(state) => { Some(state) => {
let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?;
self.state = state; self.state = state;
@ -70,13 +70,20 @@ impl<'t> Criterion for Exactness<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
None => { None => match self.parent.next(params)? {
match self.parent.next(params)? { Some(CriterionResult {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match candidates { let mut candidates = match candidates {
Some(candidates) => candidates, Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
}; };
if let Some(filtered_candidates) = filtered_candidates { if let Some(filtered_candidates) = filtered_candidates {
@ -90,17 +97,21 @@ impl<'t> Criterion for Exactness<'t> {
self.state = Some(State::new(candidates)); self.state = Some(State::new(candidates));
self.query_tree = Some(query_tree); self.query_tree = Some(query_tree);
}, }
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates, candidates,
filtered_candidates, filtered_candidates,
bucket_candidates, bucket_candidates,
})); }));
},
None => return Ok(None),
} }
None => return Ok(None),
}, },
} }
} }
@ -125,9 +136,9 @@ impl State {
fn difference_with(&mut self, lhs: &RoaringBitmap) { fn difference_with(&mut self, lhs: &RoaringBitmap) {
match self { match self {
Self::ExactAttribute(candidates) | Self::ExactAttribute(candidates)
Self::AttributeStartsWith(candidates) | | Self::AttributeStartsWith(candidates)
Self::ExactWords(candidates) => *candidates -= lhs, | Self::ExactWords(candidates) => *candidates -= lhs,
Self::Remainings(candidates_array) => { Self::Remainings(candidates_array) => {
candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs);
candidates_array.retain(|candidates| !candidates.is_empty()); candidates_array.retain(|candidates| !candidates.is_empty());
@ -137,9 +148,9 @@ impl State {
fn is_empty(&self) -> bool { fn is_empty(&self) -> bool {
match self { match self {
Self::ExactAttribute(candidates) | Self::ExactAttribute(candidates)
Self::AttributeStartsWith(candidates) | | Self::AttributeStartsWith(candidates)
Self::ExactWords(candidates) => candidates.is_empty(), | Self::ExactWords(candidates) => candidates.is_empty(),
Self::Remainings(candidates_array) => { Self::Remainings(candidates_array) => {
candidates_array.iter().all(RoaringBitmap::is_empty) candidates_array.iter().all(RoaringBitmap::is_empty)
} }
@ -158,8 +169,7 @@ fn resolve_state(
ctx: &dyn Context, ctx: &dyn Context,
state: State, state: State,
query: &[ExactQueryPart], query: &[ExactQueryPart],
) -> Result<(RoaringBitmap, Option<State>)> ) -> Result<(RoaringBitmap, Option<State>)> {
{
use State::*; use State::*;
match state { match state {
ExactAttribute(mut allowed_candidates) => { ExactAttribute(mut allowed_candidates) => {
@ -167,8 +177,11 @@ fn resolve_state(
if let Ok(query_len) = u8::try_from(query.len()) { if let Ok(query_len) = u8::try_from(query.len()) {
let attributes_ids = ctx.searchable_fields_ids()?; let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids { for id in attributes_ids {
if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { if let Some(attribute_allowed_docids) =
let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; ctx.field_id_word_count_docids(id, query_len)?
{
let mut attribute_candidates_array =
attribute_start_with_docids(ctx, id as u32, query)?;
attribute_candidates_array.push(attribute_allowed_docids); attribute_candidates_array.push(attribute_allowed_docids);
candidates |= intersection_of(attribute_candidates_array.iter().collect()); candidates |= intersection_of(attribute_candidates_array.iter().collect());
} }
@ -181,12 +194,13 @@ fn resolve_state(
} }
Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
}, }
AttributeStartsWith(mut allowed_candidates) => { AttributeStartsWith(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let attributes_ids = ctx.searchable_fields_ids()?; let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids { for id in attributes_ids {
let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; let attribute_candidates_array =
attribute_start_with_docids(ctx, id as u32, query)?;
candidates |= intersection_of(attribute_candidates_array.iter().collect()); candidates |= intersection_of(attribute_candidates_array.iter().collect());
} }
@ -195,7 +209,7 @@ fn resolve_state(
// remove current candidates from allowed candidates // remove current candidates from allowed candidates
allowed_candidates -= &candidates; allowed_candidates -= &candidates;
Ok((candidates, Some(ExactWords(allowed_candidates)))) Ok((candidates, Some(ExactWords(allowed_candidates))))
}, }
ExactWords(mut allowed_candidates) => { ExactWords(mut allowed_candidates) => {
let number_of_part = query.len(); let number_of_part = query.len();
let mut parts_candidates_array = Vec::with_capacity(number_of_part); let mut parts_candidates_array = Vec::with_capacity(number_of_part);
@ -210,7 +224,7 @@ fn resolve_state(
candidates |= synonym_candidates; candidates |= synonym_candidates;
} }
} }
}, }
// compute intersection on pair of words with a proximity of 0. // compute intersection on pair of words with a proximity of 0.
Phrase(phrase) => { Phrase(phrase) => {
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
@ -220,8 +234,8 @@ fn resolve_state(
Some(docids) => bitmaps.push(docids), Some(docids) => bitmaps.push(docids),
None => { None => {
bitmaps.clear(); bitmaps.clear();
break break;
}, }
} }
} }
} }
@ -261,7 +275,7 @@ fn resolve_state(
candidates_array.reverse(); candidates_array.reverse();
Ok((all_exact_candidates, Some(Remainings(candidates_array)))) Ok((all_exact_candidates, Some(Remainings(candidates_array))))
}, }
// pop remainings candidates until the emptiness // pop remainings candidates until the emptiness
Remainings(mut candidates_array) => { Remainings(mut candidates_array) => {
let candidates = candidates_array.pop().unwrap_or_default(); let candidates = candidates_array.pop().unwrap_or_default();
@ -270,12 +284,15 @@ fn resolve_state(
} else { } else {
Ok((candidates, None)) Ok((candidates, None))
} }
}, }
} }
} }
fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result<Vec<RoaringBitmap>> { fn attribute_start_with_docids(
ctx: &dyn Context,
attribute_id: u32,
query: &[ExactQueryPart],
) -> heed::Result<Vec<RoaringBitmap>> {
let lowest_level = TreeLevel::min_value(); let lowest_level = TreeLevel::min_value();
let mut attribute_candidates_array = Vec::new(); let mut attribute_candidates_array = Vec::new();
// start from attribute first position // start from attribute first position
@ -293,7 +310,7 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex
} }
attribute_candidates_array.push(synonyms_candidates); attribute_candidates_array.push(synonyms_candidates);
pos += 1; pos += 1;
}, }
Phrase(phrase) => { Phrase(phrase) => {
for word in phrase { for word in phrase {
let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?;
@ -325,24 +342,30 @@ pub enum ExactQueryPart {
} }
impl ExactQueryPart { impl ExactQueryPart {
fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result<Self> { fn from_primitive_query_part(
ctx: &dyn Context,
part: &PrimitiveQueryPart,
) -> heed::Result<Self> {
let part = match part { let part = match part {
PrimitiveQueryPart::Word(word, _) => { PrimitiveQueryPart::Word(word, _) => {
match ctx.synonyms(word)? { match ctx.synonyms(word)? {
Some(synonyms) => { Some(synonyms) => {
let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| { let mut synonyms: Vec<_> = synonyms
.into_iter()
.filter_map(|mut array| {
// keep 1 word synonyms only. // keep 1 word synonyms only.
match array.pop() { match array.pop() {
Some(word) if array.is_empty() => Some(word), Some(word) if array.is_empty() => Some(word),
_ => None, _ => None,
} }
}).collect(); })
.collect();
synonyms.push(word.clone()); synonyms.push(word.clone());
ExactQueryPart::Synonyms(synonyms) ExactQueryPart::Synonyms(synonyms)
}, }
None => ExactQueryPart::Synonyms(vec![word.clone()]), None => ExactQueryPart::Synonyms(vec![word.clone()]),
} }
}, }
PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()),
}; };

View File

@ -1,10 +1,10 @@
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::Result; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
use crate::search::WordDerivationsCache; use crate::search::WordDerivationsCache;
use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; use crate::Result;
/// The result of a call to the fetcher. /// The result of a call to the fetcher.
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
@ -26,7 +26,12 @@ pub struct Final<'t> {
impl<'t> Final<'t> { impl<'t> Final<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() } Final {
ctx,
parent,
wdcache: WordDerivationsCache::new(),
returned_candidates: RoaringBitmap::new(),
}
} }
#[logging_timer::time("Final::{}")] #[logging_timer::time("Final::{}")]
@ -40,10 +45,17 @@ impl<'t> Final<'t> {
}; };
match self.parent.next(&mut criterion_parameters)? { match self.parent.next(&mut criterion_parameters)? {
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { Some(CriterionResult {
query_tree,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match (candidates, query_tree.as_ref()) { let mut candidates = match (candidates, query_tree.as_ref()) {
(Some(candidates), _) => candidates, (Some(candidates), _) => candidates,
(None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates, (None, Some(qt)) => {
resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates
}
(None, None) => self.ctx.documents_ids()? - excluded_candidates, (None, None) => self.ctx.documents_ids()? - excluded_candidates,
}; };
@ -56,7 +68,7 @@ impl<'t> Final<'t> {
self.returned_candidates |= &candidates; self.returned_candidates |= &candidates;
Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }))
}, }
None => Ok(None), None => Ok(None),
} }
} }

View File

@ -1,15 +1,18 @@
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::Result; use super::{Criterion, CriterionParameters, CriterionResult};
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
use super::{Criterion, CriterionResult, CriterionParameters}; use crate::Result;
pub struct Initial { pub struct Initial {
answer: Option<CriterionResult> answer: Option<CriterionResult>,
} }
impl Initial { impl Initial {
pub fn new(query_tree: Option<Operation>, filtered_candidates: Option<RoaringBitmap>) -> Initial { pub fn new(
query_tree: Option<Operation>,
filtered_candidates: Option<RoaringBitmap>,
) -> Initial {
let answer = CriterionResult { let answer = CriterionResult {
query_tree, query_tree,
candidates: None, candidates: None,

View File

@ -1,29 +1,28 @@
use std::collections::HashMap;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
use crate::{Index, DocumentId, Result};
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use self::asc_desc::AscDesc; use self::asc_desc::AscDesc;
use self::attribute::Attribute; use self::attribute::Attribute;
use self::exactness::Exactness; use self::exactness::Exactness;
use self::r#final::Final;
use self::initial::Initial; use self::initial::Initial;
use self::proximity::Proximity; use self::proximity::Proximity;
use self::r#final::Final;
use self::typo::Typo; use self::typo::Typo;
use self::words::Words; use self::words::Words;
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use crate::{DocumentId, FieldId, Index, Result, TreeLevel};
mod asc_desc; mod asc_desc;
mod attribute; mod attribute;
mod exactness; mod exactness;
pub mod r#final;
mod initial; mod initial;
mod proximity; mod proximity;
mod typo; mod typo;
mod words; mod words;
pub mod r#final;
pub trait Criterion { pub trait Criterion {
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>>; fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>>;
@ -55,7 +54,7 @@ pub struct CriterionParameters<'a> {
#[derive(Debug)] #[derive(Debug)]
enum Candidates { enum Candidates {
Allowed(RoaringBitmap), Allowed(RoaringBitmap),
Forbidden(RoaringBitmap) Forbidden(RoaringBitmap),
} }
impl Default for Candidates { impl Default for Candidates {
@ -68,17 +67,55 @@ pub trait Context<'c> {
fn documents_ids(&self) -> heed::Result<RoaringBitmap>; fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; fn word_pair_proximity_docids(
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; &self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
fn in_prefix_cache(&self, word: &str) -> bool; fn in_prefix_cache(&self, word: &str) -> bool;
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>; fn docid_words_positions(
fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>; &self,
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>; docid: DocumentId,
) -> heed::Result<HashMap<String, RoaringBitmap>>;
fn word_position_iterator(
&self,
word: &str,
level: TreeLevel,
in_prefix_cache: bool,
left: Option<u32>,
right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
>,
>;
fn word_position_last_level(
&self,
word: &str,
in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>>;
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>; fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>; fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>; fn field_id_word_count_docids(
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>>; &self,
field_id: FieldId,
word_count: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn word_level_position_docids(
&self,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
) -> heed::Result<Option<RoaringBitmap>>;
} }
pub struct CriteriaBuilder<'t> { pub struct CriteriaBuilder<'t> {
@ -101,12 +138,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
self.index.word_prefix_docids.get(self.rtxn, &word) self.index.word_prefix_docids.get(self.rtxn, &word)
} }
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { fn word_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left, right, proximity); let key = (left, right, proximity);
self.index.word_pair_proximity_docids.get(self.rtxn, &key) self.index.word_pair_proximity_docids.get(self.rtxn, &key)
} }
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { fn word_prefix_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left, right, proximity); let key = (left, right, proximity);
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
} }
@ -119,7 +166,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
self.words_prefixes_fst.contains(word) self.words_prefixes_fst.contains(word)
} }
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { fn docid_words_positions(
&self,
docid: DocumentId,
) -> heed::Result<HashMap<String, RoaringBitmap>> {
let mut words_positions = HashMap::new(); let mut words_positions = HashMap::new();
for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? {
let ((_, word), positions) = result?; let ((_, word), positions) = result?;
@ -134,9 +184,12 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
level: TreeLevel, level: TreeLevel,
in_prefix_cache: bool, in_prefix_cache: bool,
left: Option<u32>, left: Option<u32>,
right: Option<u32> right: Option<u32>,
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> ) -> heed::Result<
{ Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c,
>,
> {
let range = { let range = {
let left = left.unwrap_or(u32::min_value()); let left = left.unwrap_or(u32::min_value());
let right = right.unwrap_or(u32::max_value()); let right = right.unwrap_or(u32::max_value());
@ -152,7 +205,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
Ok(Box::new(db.range(self.rtxn, &range)?)) Ok(Box::new(db.range(self.rtxn, &range)?))
} }
fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> { fn word_position_last_level(
&self,
word: &str,
in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>> {
let range = { let range = {
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
@ -164,7 +221,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
}; };
let last_level = db let last_level = db
.remap_data_type::<heed::types::DecodeIgnore>() .remap_data_type::<heed::types::DecodeIgnore>()
.range(self.rtxn, &range)?.last().transpose()? .range(self.rtxn, &range)?
.last()
.transpose()?
.map(|((_, level, _, _), _)| level); .map(|((_, level, _, _), _)| level);
Ok(last_level) Ok(last_level)
@ -181,12 +240,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
} }
} }
fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> { fn field_id_word_count_docids(
&self,
field_id: FieldId,
word_count: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (field_id, word_count); let key = (field_id, word_count);
self.index.field_id_word_count_docids.get(self.rtxn, &key) self.index.field_id_word_count_docids.get(self.rtxn, &key)
} }
fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result<Option<RoaringBitmap>> { fn word_level_position_docids(
&self,
word: &str,
level: TreeLevel,
left: u32,
right: u32,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (word, level, left, right); let key = (word, level, left, right);
self.index.word_level_position_docids.get(self.rtxn, &key) self.index.word_level_position_docids.get(self.rtxn, &key)
} }
@ -204,13 +273,13 @@ impl<'t> CriteriaBuilder<'t> {
query_tree: Option<Operation>, query_tree: Option<Operation>,
primitive_query: Option<Vec<PrimitiveQueryPart>>, primitive_query: Option<Vec<PrimitiveQueryPart>>,
filtered_candidates: Option<RoaringBitmap>, filtered_candidates: Option<RoaringBitmap>,
) -> Result<Final<'t>> ) -> Result<Final<'t>> {
{
use crate::criterion::Criterion as Name; use crate::criterion::Criterion as Name;
let primitive_query = primitive_query.unwrap_or_default(); let primitive_query = primitive_query.unwrap_or_default();
let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>; let mut criterion =
Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>;
for name in self.index.criteria(&self.rtxn)? { for name in self.index.criteria(&self.rtxn)? {
criterion = match name { criterion = match name {
Name::Typo => Box::new(Typo::new(self, criterion)), Name::Typo => Box::new(Typo::new(self, criterion)),
@ -218,8 +287,12 @@ impl<'t> CriteriaBuilder<'t> {
Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Proximity => Box::new(Proximity::new(self, criterion)),
Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)),
Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?),
Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), Name::Asc(field) => {
Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?)
}
Name::Desc(field) => {
Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?)
}
}; };
} }
@ -231,21 +304,20 @@ pub fn resolve_query_tree<'t>(
ctx: &'t dyn Context, ctx: &'t dyn Context,
query_tree: &Operation, query_tree: &Operation,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
fn resolve_operation<'t>( fn resolve_operation<'t>(
ctx: &'t dyn Context, ctx: &'t dyn Context,
query_tree: &Operation, query_tree: &Operation,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{ use Operation::{And, Or, Phrase, Query};
use Operation::{And, Phrase, Or, Query};
match query_tree { match query_tree {
And(ops) => { And(ops) => {
let mut ops = ops.iter().map(|op| { let mut ops = ops
resolve_operation(ctx, op, wdcache) .iter()
}).collect::<Result<Vec<_>>>()?; .map(|op| resolve_operation(ctx, op, wdcache))
.collect::<Result<Vec<_>>>()?;
ops.sort_unstable_by_key(|cds| cds.len()); ops.sort_unstable_by_key(|cds| cds.len());
@ -260,7 +332,7 @@ pub fn resolve_query_tree<'t>(
} }
} }
Ok(candidates) Ok(candidates)
}, }
Phrase(words) => { Phrase(words) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_loop = true; let mut first_loop = true;
@ -276,12 +348,12 @@ pub fn resolve_query_tree<'t>(
} else { } else {
candidates &= pair_docids; candidates &= pair_docids;
} }
}, }
None => return Ok(RoaringBitmap::new()) None => return Ok(RoaringBitmap::new()),
} }
} }
Ok(candidates) Ok(candidates)
}, }
Or(_, ops) => { Or(_, ops) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
for op in ops { for op in ops {
@ -289,7 +361,7 @@ pub fn resolve_query_tree<'t>(
candidates.union_with(&docids); candidates.union_with(&docids);
} }
Ok(candidates) Ok(candidates)
}, }
Query(q) => Ok(query_docids(ctx, q, wdcache)?), Query(q) => Ok(query_docids(ctx, q, wdcache)?),
} }
} }
@ -297,18 +369,18 @@ pub fn resolve_query_tree<'t>(
resolve_operation(ctx, query_tree, wdcache) resolve_operation(ctx, query_tree, wdcache)
} }
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
ctx: &dyn Context, ctx: &dyn Context,
left_words: &[(T, u8)], left_words: &[(T, u8)],
right_words: &[(U, u8)], right_words: &[(U, u8)],
proximity: u8 proximity: u8,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for (left, _l_typo) in left_words { for (left, _l_typo) in left_words {
for (right, _r_typo) in right_words { for (right, _r_typo) in right_words {
let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); let current_docids = ctx
.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?
.unwrap_or_default();
docids.union_with(&current_docids); docids.union_with(&current_docids);
} }
} }
@ -319,8 +391,7 @@ fn query_docids(
ctx: &dyn Context, ctx: &dyn Context,
query: &Query, query: &Query,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
match &query.kind { match &query.kind {
QueryKind::Exact { word, .. } => { QueryKind::Exact { word, .. } => {
if query.prefix && ctx.in_prefix_cache(&word) { if query.prefix && ctx.in_prefix_cache(&word) {
@ -336,7 +407,7 @@ fn query_docids(
} else { } else {
Ok(ctx.word_docids(&word)?.unwrap_or_default()) Ok(ctx.word_docids(&word)?.unwrap_or_default())
} }
}, }
QueryKind::Tolerant { typo, word } => { QueryKind::Tolerant { typo, word } => {
let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?;
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
@ -345,7 +416,7 @@ fn query_docids(
docids.union_with(&current_docids); docids.union_with(&current_docids);
} }
Ok(docids) Ok(docids)
}, }
} }
} }
@ -355,8 +426,7 @@ fn query_pair_proximity_docids(
right: &Query, right: &Query,
proximity: u8, proximity: u8,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
if proximity >= 8 { if proximity >= 8 {
let mut candidates = query_docids(ctx, left, wdcache)?; let mut candidates = query_docids(ctx, left, wdcache)?;
let right_candidates = query_docids(ctx, right, wdcache)?; let right_candidates = query_docids(ctx, right, wdcache)?;
@ -368,20 +438,31 @@ fn query_pair_proximity_docids(
match (&left.kind, &right.kind) { match (&left.kind, &right.kind) {
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
if prefix && ctx.in_prefix_cache(&right) { if prefix && ctx.in_prefix_cache(&right) {
Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) Ok(ctx
.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
.unwrap_or_default())
} else if prefix { } else if prefix {
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
} else { } else {
Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) Ok(ctx
.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
.unwrap_or_default())
}
} }
},
(QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); let l_words =
word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
if prefix && ctx.in_prefix_cache(&right) { if prefix && ctx.in_prefix_cache(&right) {
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for (left, _) in l_words { for (left, _) in l_words {
let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); let current_docids = ctx
.word_prefix_pair_proximity_docids(
left.as_ref(),
right.as_ref(),
proximity,
)?
.unwrap_or_default();
docids.union_with(&current_docids); docids.union_with(&current_docids);
} }
Ok(docids) Ok(docids)
@ -391,28 +472,36 @@ fn query_pair_proximity_docids(
} else { } else {
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
} }
}, }
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
}, }
(QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { (
let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); QueryKind::Tolerant { typo: l_typo, word: left },
QueryKind::Tolerant { typo: r_typo, word: right },
) => {
let l_words =
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
}, }
} }
} }
#[cfg(test)] #[cfg(test)]
pub mod test { pub mod test {
use maplit::hashmap;
use rand::{Rng, SeedableRng, rngs::StdRng};
use super::*;
use std::collections::HashMap; use std::collections::HashMap;
fn s(s: &str) -> String { s.to_string() } use maplit::hashmap;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
fn s(s: &str) -> String {
s.to_string()
}
pub struct TestContext<'t> { pub struct TestContext<'t> {
words_fst: fst::Set<Cow<'t, [u8]>>, words_fst: fst::Set<Cow<'t, [u8]>>,
word_docids: HashMap<String, RoaringBitmap>, word_docids: HashMap<String, RoaringBitmap>,
@ -435,12 +524,22 @@ pub mod test {
Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) Ok(self.word_prefix_docids.get(&word.to_string()).cloned())
} }
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { fn word_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left.to_string(), right.to_string(), proximity.into()); let key = (left.to_string(), right.to_string(), proximity.into());
Ok(self.word_pair_proximity_docids.get(&key).cloned()) Ok(self.word_pair_proximity_docids.get(&key).cloned())
} }
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { fn word_prefix_pair_proximity_docids(
&self,
left: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
let key = (left.to_string(), right.to_string(), proximity.into()); let key = (left.to_string(), right.to_string(), proximity.into());
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
} }
@ -453,24 +552,44 @@ pub mod test {
self.word_prefix_docids.contains_key(&word.to_string()) self.word_prefix_docids.contains_key(&word.to_string())
} }
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { fn docid_words_positions(
&self,
docid: DocumentId,
) -> heed::Result<HashMap<String, RoaringBitmap>> {
if let Some(docid_words) = self.docid_words.get(&docid) { if let Some(docid_words) = self.docid_words.get(&docid) {
Ok(docid_words Ok(docid_words
.iter() .iter()
.enumerate() .enumerate()
.map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) .map(|(i, w)| {
.collect() (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))
) })
.collect())
} else { } else {
Ok(HashMap::new()) Ok(HashMap::new())
} }
} }
fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> { fn word_position_iterator(
&self,
_word: &str,
_level: TreeLevel,
_in_prefix_cache: bool,
_left: Option<u32>,
_right: Option<u32>,
) -> heed::Result<
Box<
dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>>
+ 'c,
>,
> {
todo!() todo!()
} }
fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> { fn word_position_last_level(
&self,
_word: &str,
_in_prefix_cache: bool,
) -> heed::Result<Option<TreeLevel>> {
todo!() todo!()
} }
@ -482,11 +601,21 @@ pub mod test {
todo!() todo!()
} }
fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result<Option<RoaringBitmap>> { fn word_level_position_docids(
&self,
_word: &str,
_level: TreeLevel,
_left: u32,
_right: u32,
) -> heed::Result<Option<RoaringBitmap>> {
todo!() todo!()
} }
fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> { fn field_id_word_count_docids(
&self,
_field_id: FieldId,
_word_count: u8,
) -> heed::Result<Option<RoaringBitmap>> {
todo!() todo!()
} }
} }
@ -540,7 +669,9 @@ pub mod test {
let mut word_prefix_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new();
for (lword, lcandidates) in &word_docids { for (lword, lcandidates) in &word_docids {
for (rword, rcandidates) in &word_docids { for (rword, rcandidates) in &word_docids {
if lword == rword { continue } if lword == rword {
continue;
}
let candidates = lcandidates & rcandidates; let candidates = lcandidates & rcandidates;
for candidate in candidates { for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) { if let Some(docid_words) = docid_words.get(&candidate) {
@ -551,24 +682,31 @@ pub mod test {
} else { } else {
(s(lword), s(rword), (lposition - rposition + 1) as i32) (s(lword), s(rword), (lposition - rposition + 1) as i32)
}; };
let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); let docids = word_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate); docids.push(candidate);
} }
} }
} }
for (pword, pcandidates) in &word_prefix_docids { for (pword, pcandidates) in &word_prefix_docids {
if lword.starts_with(pword) { continue } if lword.starts_with(pword) {
continue;
}
let candidates = lcandidates & pcandidates; let candidates = lcandidates & pcandidates;
for candidate in candidates { for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) { if let Some(docid_words) = docid_words.get(&candidate) {
let lposition = docid_words.iter().position(|w| w == lword).unwrap(); let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); let rposition =
docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
let key = if lposition < rposition { let key = if lposition < rposition {
(s(lword), s(pword), (rposition - lposition) as i32) (s(lword), s(pword), (rposition - lposition) as i32)
} else { } else {
(s(lword), s(pword), (lposition - rposition + 1) as i32) (s(lword), s(pword), (lposition - rposition + 1) as i32)
}; };
let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); let docids = word_prefix_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate); docids.push(candidate);
} }
} }

View File

@ -2,22 +2,16 @@ use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::HashMap; use std::collections::hash_map::HashMap;
use std::mem::take; use std::mem::take;
use roaring::RoaringBitmap;
use log::debug; use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::{maximum_proximity, Operation, Query};
use crate::search::{build_dfa, WordDerivationsCache};
use crate::search::{query_tree::QueryKind};
use crate::{DocumentId, Position, Result};
use super::{ use super::{
Context, query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion,
Criterion, CriterionParameters, CriterionResult,
CriterionParameters,
CriterionResult,
query_docids,
query_pair_proximity_docids,
resolve_query_tree,
}; };
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
use crate::search::{build_dfa, WordDerivationsCache};
use crate::{DocumentId, Position, Result};
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
@ -63,28 +57,33 @@ impl<'t> Criterion for Proximity<'t> {
} }
loop { loop {
debug!("Proximity at iteration {} (max prox {:?}) ({:?})", debug!(
"Proximity at iteration {} (max prox {:?}) ({:?})",
self.proximity, self.proximity,
self.state.as_ref().map(|(mp, _, _)| mp), self.state.as_ref().map(|(mp, _, _)| mp),
self.state.as_ref().map(|(_, _, cd)| cd), self.state.as_ref().map(|(_, _, cd)| cd),
); );
match &mut self.state { match &mut self.state {
Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { Some((max_prox, _, allowed_candidates))
if allowed_candidates.is_empty() || self.proximity > *max_prox =>
{
self.state = None; // reset state self.state = None; // reset state
}, }
Some((_, query_tree, allowed_candidates)) => { Some((_, query_tree, allowed_candidates)) => {
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD
&& self.proximity > PROXIMITY_THRESHOLD
{
if let Some(cache) = self.plane_sweep_cache.as_mut() { if let Some(cache) = self.plane_sweep_cache.as_mut() {
match cache.next() { match cache.next() {
Some((p, candidates)) => { Some((p, candidates)) => {
self.proximity = p; self.proximity = p;
candidates candidates
}, }
None => { None => {
self.state = None; // reset state self.state = None; // reset state
continue continue;
}, }
} }
} else { } else {
let cache = resolve_plane_sweep_candidates( let cache = resolve_plane_sweep_candidates(
@ -95,9 +94,10 @@ impl<'t> Criterion for Proximity<'t> {
)?; )?;
self.plane_sweep_cache = Some(cache.into_iter()); self.plane_sweep_cache = Some(cache.into_iter());
continue continue;
} }
} else { // use set theory based algorithm } else {
// use set theory based algorithm
resolve_candidates( resolve_candidates(
self.ctx, self.ctx,
&query_tree, &query_tree,
@ -117,13 +117,20 @@ impl<'t> Criterion for Proximity<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
None => { None => match self.parent.next(params)? {
match self.parent.next(params)? { Some(CriterionResult {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
let mut candidates = match candidates { let mut candidates = match candidates {
Some(candidates) => candidates, Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
}; };
if let Some(filtered_candidates) = filtered_candidates { if let Some(filtered_candidates) = filtered_candidates {
@ -139,17 +146,21 @@ impl<'t> Criterion for Proximity<'t> {
self.state = Some((maximum_proximity as u8, query_tree, candidates)); self.state = Some((maximum_proximity as u8, query_tree, candidates));
self.proximity = 0; self.proximity = 0;
self.plane_sweep_cache = None; self.plane_sweep_cache = None;
}, }
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates, candidates,
filtered_candidates, filtered_candidates,
bucket_candidates, bucket_candidates,
})); }));
},
None => return Ok(None),
} }
None => return Ok(None),
}, },
} }
} }
@ -162,32 +173,33 @@ fn resolve_candidates<'t>(
proximity: u8, proximity: u8,
cache: &mut Cache, cache: &mut Cache,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
fn resolve_operation<'t>( fn resolve_operation<'t>(
ctx: &'t dyn Context, ctx: &'t dyn Context,
query_tree: &Operation, query_tree: &Operation,
proximity: u8, proximity: u8,
cache: &mut Cache, cache: &mut Cache,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>> ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
{ use Operation::{And, Or, Phrase};
use Operation::{And, Phrase, Or};
let result = match query_tree { let result = match query_tree {
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
Phrase(words) => if proximity == 0 { Phrase(words) => {
let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); if proximity == 0 {
let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_left = words
.first()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words
.last()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let mut candidates = None; let mut candidates = None;
for slice in words.windows(2) { for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]); let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? { match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => { Some(pair_docids) => match candidates.as_mut() {
match candidates.as_mut() {
Some(candidates) => *candidates &= pair_docids, Some(candidates) => *candidates &= pair_docids,
None => candidates = Some(pair_docids), None => candidates = Some(pair_docids),
}
}, },
None => { None => {
candidates = None; candidates = None;
@ -201,7 +213,8 @@ fn resolve_candidates<'t>(
} }
} else { } else {
Default::default() Default::default()
}, }
}
Or(_, ops) => { Or(_, ops) => {
let mut output = Vec::new(); let mut output = Vec::new();
for op in ops { for op in ops {
@ -209,13 +222,15 @@ fn resolve_candidates<'t>(
output.extend(result); output.extend(result);
} }
output output
}, }
Operation::Query(q) => if proximity == 0 { Operation::Query(q) => {
if proximity == 0 {
let candidates = query_docids(ctx, q, wdcache)?; let candidates = query_docids(ctx, q, wdcache)?;
vec![(q.clone(), q.clone(), candidates)] vec![(q.clone(), q.clone(), candidates)]
} else { } else {
Default::default() Default::default()
}, }
}
}; };
Ok(result) Ok(result)
@ -228,8 +243,7 @@ fn resolve_candidates<'t>(
proximity: u8, proximity: u8,
cache: &mut Cache, cache: &mut Cache,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>> ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
{
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> { fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
(0..=mana.min(left_max)).map(move |m| (m, mana - m)) (0..=mana.min(left_max)).map(move |m| (m, mana - m))
} }
@ -257,7 +271,8 @@ fn resolve_candidates<'t>(
for (ll, lr, lcandidates) in lefts { for (ll, lr, lcandidates) in lefts {
for (rl, rr, rcandidates) in rights { for (rl, rr, rcandidates) in rights {
let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; let mut candidates =
query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
if lcandidates.len() < rcandidates.len() { if lcandidates.len() < rcandidates.len() {
candidates.intersect_with(lcandidates); candidates.intersect_with(lcandidates);
candidates.intersect_with(rcandidates); candidates.intersect_with(rcandidates);
@ -282,22 +297,26 @@ fn resolve_candidates<'t>(
proximity: u8, proximity: u8,
cache: &mut Cache, cache: &mut Cache,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>> ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
{
// Extract the first two elements but gives the tail // Extract the first two elements but gives the tail
// that is just after the first element. // that is just after the first element.
let next = branches.split_first().map(|(h1, t)| { let next =
(h1, t.split_first().map(|(h2, _)| (h2, t))) branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
});
match next { match next {
Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache), Some((head1, Some((head2, [_])))) => {
mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
}
Some((head1, Some((head2, tail)))) => { Some((head1, Some((head2, tail)))) => {
let mut output = Vec::new(); let mut output = Vec::new();
for p in 0..=proximity { for p in 0..=proximity {
for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? { for (lhead, _, head_candidates) in
mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
{
if !head_candidates.is_empty() { if !head_candidates.is_empty() {
for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? { for (_, rtail, mut candidates) in
mdfs(ctx, tail, proximity - p, cache, wdcache)?
{
candidates.intersect_with(&head_candidates); candidates.intersect_with(&head_candidates);
if !candidates.is_empty() { if !candidates.is_empty() {
output.push((lhead.clone(), rtail, candidates)); output.push((lhead.clone(), rtail, candidates));
@ -307,7 +326,7 @@ fn resolve_candidates<'t>(
} }
} }
Ok(output) Ok(output)
}, }
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
None => Ok(Default::default()), None => Ok(Default::default()),
} }
@ -325,47 +344,48 @@ fn resolve_plane_sweep_candidates(
query_tree: &Operation, query_tree: &Operation,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<BTreeMap<u8, RoaringBitmap>> ) -> Result<BTreeMap<u8, RoaringBitmap>> {
{
/// FIXME may be buggy with query like "new new york" /// FIXME may be buggy with query like "new new york"
fn plane_sweep( fn plane_sweep(
groups_positions: Vec<Vec<(Position, u8, Position)>>, groups_positions: Vec<Vec<(Position, u8, Position)>>,
consecutive: bool, consecutive: bool,
) -> Result<Vec<(Position, u8, Position)>> ) -> Result<Vec<(Position, u8, Position)>> {
{
fn compute_groups_proximity( fn compute_groups_proximity(
groups: &[(usize, (Position, u8, Position))], groups: &[(usize, (Position, u8, Position))],
consecutive: bool, consecutive: bool,
) -> Option<(Position, u8, Position)> ) -> Option<(Position, u8, Position)> {
{
// take the inner proximity of the first group as initial // take the inner proximity of the first group as initial
let (_, (_, mut proximity, _)) = groups.first()?; let (_, (_, mut proximity, _)) = groups.first()?;
let (_, (left_most_pos, _, _)) = groups.first()?; let (_, (left_most_pos, _, _)) = groups.first()?;
let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; let (_, (_, _, right_most_pos)) =
groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
for pair in groups.windows(2) { for pair in groups.windows(2) {
if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
// if two positions are equal, meaning that they share at least a word, we return None // if two positions are equal, meaning that they share at least a word, we return None
if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
return None return None;
} }
let pair_proximity = { let pair_proximity = {
// if intervals are disjoint [..].(..) // if intervals are disjoint [..].(..)
if lpos2 > rpos1 { lpos2 - rpos1 } if lpos2 > rpos1 {
lpos2 - rpos1
}
// if the second interval is a subset of the first [.(..).] // if the second interval is a subset of the first [.(..).]
else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) } else if rpos2 < rpos1 {
(lpos2 - lpos1).min(rpos1 - rpos2)
}
// if intervals overlaps [.(..].) // if intervals overlaps [.(..].)
else { (lpos2 - lpos1).min(rpos2 - rpos1) } else {
(lpos2 - lpos1).min(rpos2 - rpos1)
}
}; };
// if groups are in the good order (query order) we remove 1 to the proximity // if groups are in the good order (query order) we remove 1 to the proximity
// the proximity is clamped to 7 // the proximity is clamped to 7
let pair_proximity = if i1 < i2 { let pair_proximity =
(pair_proximity - 1).min(7) if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
} else {
pair_proximity.min(7)
};
proximity += pair_proximity as u8 + prox2; proximity += pair_proximity as u8 + prox2;
} }
@ -381,7 +401,8 @@ fn resolve_plane_sweep_candidates(
let groups_len = groups_positions.len(); let groups_len = groups_positions.len();
let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); let mut groups_positions: Vec<_> =
groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
// Pop top elements of each list. // Pop top elements of each list.
let mut current = Vec::with_capacity(groups_len); let mut current = Vec::with_capacity(groups_len);
@ -452,9 +473,8 @@ fn resolve_plane_sweep_candidates(
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
words_positions: &HashMap<String, RoaringBitmap>, words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Position, u8, Position)>> ) -> Result<Vec<(Position, u8, Position)>> {
{ use Operation::{And, Or, Phrase};
use Operation::{And, Phrase, Or};
if let Some(result) = rocache.get(query_tree) { if let Some(result) = rocache.get(query_tree) {
return Ok(result.clone()); return Ok(result.clone());
@ -464,11 +484,18 @@ fn resolve_plane_sweep_candidates(
And(ops) => { And(ops) => {
let mut groups_positions = Vec::with_capacity(ops.len()); let mut groups_positions = Vec::with_capacity(ops.len());
for operation in ops { for operation in ops {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; let positions = resolve_operation(
ctx,
operation,
docid,
rocache,
words_positions,
wdcache,
)?;
groups_positions.push(positions); groups_positions.push(positions);
} }
plane_sweep(groups_positions, false)? plane_sweep(groups_positions, false)?
}, }
Phrase(words) => { Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len()); let mut groups_positions = Vec::with_capacity(words.len());
for word in words { for word in words {
@ -479,16 +506,23 @@ fn resolve_plane_sweep_candidates(
groups_positions.push(positions); groups_positions.push(positions);
} }
plane_sweep(groups_positions, true)? plane_sweep(groups_positions, true)?
}, }
Or(_, ops) => { Or(_, ops) => {
let mut result = Vec::new(); let mut result = Vec::new();
for op in ops { for op in ops {
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) result.extend(resolve_operation(
ctx,
op,
docid,
rocache,
words_positions,
wdcache,
)?)
} }
result.sort_unstable(); result.sort_unstable();
result result
}, }
Operation::Query(Query { prefix, kind }) => { Operation::Query(Query { prefix, kind }) => {
let mut result = Vec::new(); let mut result = Vec::new();
match kind { match kind {
@ -500,7 +534,7 @@ fn resolve_plane_sweep_candidates(
} else if let Some(positions) = words_positions.get(word) { } else if let Some(positions) = words_positions.get(word) {
result.extend(positions.iter().map(|p| (p, 0, p))); result.extend(positions.iter().map(|p| (p, 0, p)));
} }
}, }
QueryKind::Tolerant { typo, word } => { QueryKind::Tolerant { typo, word } => {
let iter = word_derivations(word, *prefix, *typo, &words_positions) let iter = word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p))); .flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
@ -522,8 +556,7 @@ fn resolve_plane_sweep_candidates(
is_prefix: bool, is_prefix: bool,
max_typo: u8, max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>, words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap> ) -> impl Iterator<Item = &'a RoaringBitmap> {
{
let dfa = build_dfa(word, max_typo, is_prefix); let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| { words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance; use levenshtein_automata::Distance;

View File

@ -1,20 +1,17 @@
use std::{borrow::Cow, collections::HashMap, mem::take}; use std::borrow::Cow;
use std::collections::HashMap;
use std::mem::take;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
CriterionResult,
};
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache}; use crate::search::{word_derivations, WordDerivationsCache};
use crate::Result; use crate::Result;
use super::{
Candidates,
Context,
Criterion,
CriterionParameters,
CriterionResult,
query_docids,
resolve_query_tree,
};
/// Maximum number of typo for a word of any length. /// Maximum number of typo for a word of any length.
const MAX_TYPOS_PER_WORD: u8 = 2; const MAX_TYPOS_PER_WORD: u8 = 2;
@ -54,7 +51,8 @@ impl<'t> Criterion for Typo<'t> {
} }
loop { loop {
debug!("Typo at iteration {} (max typos {:?}) ({:?})", debug!(
"Typo at iteration {} (max typos {:?}) ({:?})",
self.typos, self.typos,
self.state.as_ref().map(|(mt, _, _)| mt), self.state.as_ref().map(|(mt, _, _)| mt),
self.state.as_ref().map(|(_, _, cd)| cd), self.state.as_ref().map(|(_, _, cd)| cd),
@ -63,29 +61,42 @@ impl<'t> Criterion for Typo<'t> {
match self.state.as_mut() { match self.state.as_mut() {
Some((max_typos, _, _)) if self.typos > *max_typos => { Some((max_typos, _, _)) if self.typos > *max_typos => {
self.state = None; // reset state self.state = None; // reset state
}, }
Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
self.state = None; // reset state self.state = None; // reset state
}, }
Some((_, query_tree, candidates_authorization)) => { Some((_, query_tree, candidates_authorization)) => {
let fst = self.ctx.words_fst(); let fst = self.ctx.words_fst();
let new_query_tree = match self.typos { let new_query_tree = match self.typos {
typos if typos < MAX_TYPOS_PER_WORD => { typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree(
alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? &fst,
}, query_tree.clone(),
self.typos,
params.wdcache,
)?,
MAX_TYPOS_PER_WORD => { MAX_TYPOS_PER_WORD => {
// When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
// we keep the altered query tree // we keep the altered query tree
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; *query_tree = alterate_query_tree(
&fst,
query_tree.clone(),
self.typos,
params.wdcache,
)?;
// we compute the allowed candidates // we compute the allowed candidates
let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; let query_tree_allowed_candidates =
resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
// we assign the allowed candidates to the candidates authorization. // we assign the allowed candidates to the candidates authorization.
*candidates_authorization = match take(candidates_authorization) { *candidates_authorization = match take(candidates_authorization) {
Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), Allowed(allowed_candidates) => {
Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), Allowed(query_tree_allowed_candidates & allowed_candidates)
}
Forbidden(forbidden_candidates) => {
Allowed(query_tree_allowed_candidates - forbidden_candidates)
}
}; };
query_tree.clone() query_tree.clone()
}, }
_otherwise => query_tree.clone(), _otherwise => query_tree.clone(),
}; };
@ -101,11 +112,11 @@ impl<'t> Criterion for Typo<'t> {
Allowed(allowed_candidates) => { Allowed(allowed_candidates) => {
candidates &= &*allowed_candidates; candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates; *allowed_candidates -= &candidates;
}, }
Forbidden(forbidden_candidates) => { Forbidden(forbidden_candidates) => {
candidates -= &*forbidden_candidates; candidates -= &*forbidden_candidates;
*forbidden_candidates |= &candidates; *forbidden_candidates |= &candidates;
}, }
} }
let bucket_candidates = match self.bucket_candidates.as_mut() { let bucket_candidates = match self.bucket_candidates.as_mut() {
@ -121,35 +132,45 @@ impl<'t> Criterion for Typo<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(bucket_candidates), bucket_candidates: Some(bucket_candidates),
})); }));
}, }
None => { None => match self.parent.next(params)? {
match self.parent.next(params)? { Some(CriterionResult {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { query_tree: Some(query_tree),
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { candidates,
filtered_candidates,
bucket_candidates,
}) => {
self.bucket_candidates =
match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc),
}; };
let candidates = match candidates.or(filtered_candidates) { let candidates = match candidates.or(filtered_candidates) {
Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates), Some(candidates) => {
Candidates::Allowed(candidates - params.excluded_candidates)
}
None => Candidates::Forbidden(params.excluded_candidates.clone()), None => Candidates::Forbidden(params.excluded_candidates.clone()),
}; };
let maximum_typos = maximum_typo(&query_tree) as u8; let maximum_typos = maximum_typo(&query_tree) as u8;
self.state = Some((maximum_typos, query_tree, candidates)); self.state = Some((maximum_typos, query_tree, candidates));
self.typos = 0; self.typos = 0;
}
}, Some(CriterionResult {
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates, candidates,
filtered_candidates, filtered_candidates,
bucket_candidates, bucket_candidates,
})); }));
},
None => return Ok(None),
} }
None => return Ok(None),
}, },
} }
} }
@ -164,21 +185,19 @@ fn alterate_query_tree(
mut query_tree: Operation, mut query_tree: Operation,
number_typos: u8, number_typos: u8,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<Operation> ) -> Result<Operation> {
{
fn recurse( fn recurse(
words_fst: &fst::Set<Cow<[u8]>>, words_fst: &fst::Set<Cow<[u8]>>,
operation: &mut Operation, operation: &mut Operation,
number_typos: u8, number_typos: u8,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<()> ) -> Result<()> {
{ use Operation::{And, Or, Phrase};
use Operation::{And, Phrase, Or};
match operation { match operation {
And(ops) | Or(_, ops) => { And(ops) | Or(_, ops) => {
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
}, }
// Because Phrases don't allow typos, no alteration can be done. // Because Phrases don't allow typos, no alteration can be done.
Phrase(_words) => return Ok(()), Phrase(_words) => return Ok(()),
Operation::Query(q) => { Operation::Query(q) => {
@ -193,19 +212,25 @@ fn alterate_query_tree(
} else { } else {
let typo = *typo.min(&number_typos); let typo = *typo.min(&number_typos);
let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?;
let queries = words.iter().map(|(word, typo)| { let queries = words
.iter()
.map(|(word, typo)| {
Operation::Query(Query { Operation::Query(Query {
prefix: false, prefix: false,
kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, kind: QueryKind::Exact {
original_typo: *typo,
word: word.to_string(),
},
}) })
}).collect(); })
.collect();
*operation = Operation::or(false, queries); *operation = Operation::or(false, queries);
} }
} }
Ok(()) Ok(())
}, }
} }
} }
@ -219,22 +244,18 @@ fn resolve_candidates<'t>(
number_typos: u8, number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>, cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
fn resolve_operation<'t>( fn resolve_operation<'t>(
ctx: &'t dyn Context, ctx: &'t dyn Context,
query_tree: &Operation, query_tree: &Operation,
number_typos: u8, number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>, cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{ use Operation::{And, Or, Phrase, Query};
use Operation::{And, Phrase, Or, Query};
match query_tree { match query_tree {
And(ops) => { And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
mdfs(ctx, ops, number_typos, cache, wdcache)
},
Phrase(words) => { Phrase(words) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_loop = true; let mut first_loop = true;
@ -250,12 +271,12 @@ fn resolve_candidates<'t>(
} else { } else {
candidates &= pair_docids; candidates &= pair_docids;
} }
}, }
None => return Ok(RoaringBitmap::new()) None => return Ok(RoaringBitmap::new()),
} }
} }
Ok(candidates) Ok(candidates)
}, }
Or(_, ops) => { Or(_, ops) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
for op in ops { for op in ops {
@ -263,12 +284,14 @@ fn resolve_candidates<'t>(
candidates.union_with(&docids); candidates.union_with(&docids);
} }
Ok(candidates) Ok(candidates)
}, }
Query(q) => if q.kind.typo() == number_typos { Query(q) => {
if q.kind.typo() == number_typos {
Ok(query_docids(ctx, q, wdcache)?) Ok(query_docids(ctx, q, wdcache)?)
} else { } else {
Ok(RoaringBitmap::new()) Ok(RoaringBitmap::new())
}, }
}
} }
} }
@ -278,8 +301,7 @@ fn resolve_candidates<'t>(
mana: u8, mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>, cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
match branches.split_first() { match branches.split_first() {
Some((head, [])) => { Some((head, [])) => {
let cache_key = (head.clone(), mana); let cache_key = (head.clone(), mana);
@ -290,7 +312,7 @@ fn resolve_candidates<'t>(
cache.insert(cache_key, candidates.clone()); cache.insert(cache_key, candidates.clone());
Ok(candidates) Ok(candidates)
} }
}, }
Some((head, tail)) => { Some((head, tail)) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
@ -313,7 +335,7 @@ fn resolve_candidates<'t>(
} }
Ok(candidates) Ok(candidates)
}, }
None => Ok(RoaringBitmap::new()), None => Ok(RoaringBitmap::new()),
} }
} }
@ -323,9 +345,9 @@ fn resolve_candidates<'t>(
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*;
use super::super::initial::Initial; use super::super::initial::Initial;
use super::super::test::TestContext; use super::super::test::TestContext;
use super::*;
#[test] #[test]
fn initial_placeholder_no_facets() { fn initial_placeholder_no_facets() {
@ -348,13 +370,23 @@ mod test {
#[test] #[test]
fn initial_query_tree_no_facets() { fn initial_query_tree_no_facets() {
let context = TestContext::default(); let context = TestContext::default();
let query_tree = Operation::Or(false, vec![ let query_tree = Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), prefix: false,
]) kind: QueryKind::exact("split".to_string()),
]); }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let facet_candidates = None; let facet_candidates = None;
@ -369,13 +401,23 @@ mod test {
& context.word_docids("this").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap()
& context.word_docids("world").unwrap().unwrap(); & context.word_docids("world").unwrap().unwrap();
let expected_1 = CriterionResult { let expected_1 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![ query_tree: Some(Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), prefix: false,
]), kind: QueryKind::exact("split".to_string()),
])), }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
])],
)),
candidates: Some(candidates_1.clone()), candidates: Some(candidates_1.clone()),
bucket_candidates: Some(candidates_1), bucket_candidates: Some(candidates_1),
filtered_candidates: None, filtered_candidates: None,
@ -383,22 +425,37 @@ mod test {
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
let candidates_2 = ( let candidates_2 = (context.word_docids("split").unwrap().unwrap()
context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap()
& context.word_docids("word").unwrap().unwrap() & context.word_docids("word").unwrap().unwrap())
) - context.word_docids("world").unwrap().unwrap(); - context.word_docids("world").unwrap().unwrap();
let expected_2 = CriterionResult { let expected_2 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![ query_tree: Some(Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Or(false, vec![ prefix: false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), kind: QueryKind::exact("split".to_string()),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), }),
]), Operation::Query(Query {
]), prefix: false,
])), kind: QueryKind::exact("this".to_string()),
}),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact_with_typo(1, "word".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
],
),
])],
)),
candidates: Some(candidates_2.clone()), candidates: Some(candidates_2.clone()),
bucket_candidates: Some(candidates_2), bucket_candidates: Some(candidates_2),
filtered_candidates: None, filtered_candidates: None,
@ -437,17 +494,26 @@ mod test {
#[test] #[test]
fn initial_query_tree_with_facets() { fn initial_query_tree_with_facets() {
let context = TestContext::default(); let context = TestContext::default();
let query_tree = Operation::Or(false, vec![ let query_tree = Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), prefix: false,
]) kind: QueryKind::exact("split".to_string()),
]); }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let mut criterion_parameters = CriterionParameters { let mut criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(), wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(), excluded_candidates: &RoaringBitmap::new(),
@ -459,13 +525,23 @@ mod test {
& context.word_docids("this").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap()
& context.word_docids("world").unwrap().unwrap(); & context.word_docids("world").unwrap().unwrap();
let expected_1 = CriterionResult { let expected_1 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![ query_tree: Some(Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), prefix: false,
]), kind: QueryKind::exact("split".to_string()),
])), }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
])],
)),
candidates: Some(&candidates_1 & &facet_candidates), candidates: Some(&candidates_1 & &facet_candidates),
bucket_candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: Some(&candidates_1 & &facet_candidates),
filtered_candidates: None, filtered_candidates: None,
@ -473,22 +549,37 @@ mod test {
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
let candidates_2 = ( let candidates_2 = (context.word_docids("split").unwrap().unwrap()
context.word_docids("split").unwrap().unwrap()
& context.word_docids("this").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap()
& context.word_docids("word").unwrap().unwrap() & context.word_docids("word").unwrap().unwrap())
) - context.word_docids("world").unwrap().unwrap(); - context.word_docids("world").unwrap().unwrap();
let expected_2 = CriterionResult { let expected_2 = CriterionResult {
query_tree: Some(Operation::Or(false, vec![ query_tree: Some(Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Or(false, vec![ prefix: false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), kind: QueryKind::exact("split".to_string()),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), }),
]), Operation::Query(Query {
]), prefix: false,
])), kind: QueryKind::exact("this".to_string()),
}),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact_with_typo(1, "word".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("world".to_string()),
}),
],
),
])],
)),
candidates: Some(&candidates_2 & &facet_candidates), candidates: Some(&candidates_2 & &facet_candidates),
bucket_candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: Some(&candidates_2 & &facet_candidates),
filtered_candidates: None, filtered_candidates: None,

View File

@ -3,9 +3,9 @@ use std::mem::take;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
use crate::Result; use crate::Result;
use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree};
pub struct Words<'t> { pub struct Words<'t> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
@ -44,11 +44,12 @@ impl<'t> Criterion for Words<'t> {
Some(query_tree) => { Some(query_tree) => {
let candidates = match self.candidates.as_mut() { let candidates = match self.candidates.as_mut() {
Some(allowed_candidates) => { Some(allowed_candidates) => {
let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; let mut candidates =
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
candidates &= &*allowed_candidates; candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates; *allowed_candidates -= &candidates;
Some(candidates) Some(candidates)
}, }
None => None, None => None,
}; };
@ -63,29 +64,38 @@ impl<'t> Criterion for Words<'t> {
filtered_candidates: self.filtered_candidates.clone(), filtered_candidates: self.filtered_candidates.clone(),
bucket_candidates, bucket_candidates,
})); }));
}, }
None => { None => match self.parent.next(params)? {
match self.parent.next(params)? { Some(CriterionResult {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { query_tree: Some(query_tree),
candidates,
filtered_candidates,
bucket_candidates,
}) => {
self.query_trees = explode_query_tree(query_tree); self.query_trees = explode_query_tree(query_tree);
self.candidates = candidates; self.candidates = candidates;
self.filtered_candidates = filtered_candidates; self.filtered_candidates = filtered_candidates;
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { self.bucket_candidates =
match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc),
}; };
}, }
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates, candidates,
filtered_candidates, filtered_candidates,
bucket_candidates, bucket_candidates,
})); }));
},
None => return Ok(None),
} }
None => return Ok(None),
}, },
} }
} }

View File

@ -3,11 +3,11 @@ use std::mem::size_of;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::error::InternalError; use crate::error::InternalError;
use crate::heed_codec::facet::*; use crate::heed_codec::facet::*;
use crate::index::db_name; use crate::index::db_name;
use crate::{DocumentId, FieldId, Index, Result}; use crate::{DocumentId, FieldId, Index, Result};
use super::{Distinct, DocIter};
const FID_SIZE: usize = size_of::<FieldId>(); const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>(); const DOCID_SIZE: usize = size_of::<DocumentId>();
@ -28,11 +28,7 @@ pub struct FacetDistinct<'a> {
impl<'a> FacetDistinct<'a> { impl<'a> FacetDistinct<'a> {
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
Self { Self { distinct, index, txn }
distinct,
index,
txn,
}
} }
} }
@ -47,16 +43,12 @@ pub struct FacetDistinctIter<'a> {
impl<'a> FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> {
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> { fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key))
.facet_id_string_docids
.get(self.txn, &(self.distinct, key))
} }
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> { fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
// get facet docids on level 0 // get facet docids on level 0
self.index self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key))
.facet_id_f64_docids
.get(self.txn, &(self.distinct, 0, key, key))
} }
fn distinct_string(&mut self, id: DocumentId) -> Result<()> { fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
@ -64,9 +56,8 @@ impl<'a> FacetDistinctIter<'a> {
for item in iter { for item in iter {
let ((_, _, value), _) = item?; let ((_, _, value), _) = item?;
let facet_docids = self let facet_docids =
.facet_string_docids(value)? self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::FACET_ID_STRING_DOCIDS, db_name: db_name::FACET_ID_STRING_DOCIDS,
key: None, key: None,
})?; })?;
@ -83,9 +74,8 @@ impl<'a> FacetDistinctIter<'a> {
for item in iter { for item in iter {
let ((_, _, value), _) = item?; let ((_, _, value), _) = item?;
let facet_docids = self let facet_docids =
.facet_number_docids(value)? self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::FACET_ID_F64_DOCIDS, db_name: db_name::FACET_ID_F64_DOCIDS,
key: None, key: None,
})?; })?;

View File

@ -1,11 +1,11 @@
mod facet_distinct; mod facet_distinct;
mod noop_distinct; mod noop_distinct;
pub use facet_distinct::FacetDistinct;
pub use noop_distinct::NoopDistinct;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::{DocumentId, Result}; use crate::{DocumentId, Result};
pub use facet_distinct::FacetDistinct;
pub use noop_distinct::NoopDistinct;
 /// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`. /// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`.
/// It provides a way to get back the ownership to the excluded set. /// It provides a way to get back the ownership to the excluded set.
@ -29,13 +29,15 @@ mod test {
use std::collections::HashSet; use std::collections::HashSet;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use rand::{seq::SliceRandom, Rng}; use rand::seq::SliceRandom;
use rand::Rng;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::{json, Value}; use serde_json::{json, Value};
use crate::index::{Index, tests::TempIndex}; use crate::index::tests::TempIndex;
use crate::index::Index;
use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use crate::{BEU32, FieldId, DocumentId}; use crate::{DocumentId, FieldId, BEU32};
static JSON: Lazy<Value> = Lazy::new(generate_json); static JSON: Lazy<Value> = Lazy::new(generate_json);
@ -89,9 +91,7 @@ mod test {
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
addition.update_format(UpdateFormat::Json); addition.update_format(UpdateFormat::Json);
addition addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
.execute(JSON.to_string().as_bytes(), |_, _| ())
.unwrap();
let fields_map = index.fields_ids_map(&txn).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap();
let fid = fields_map.id(&distinct).unwrap(); let fid = fields_map.id(&distinct).unwrap();
@ -103,7 +103,6 @@ mod test {
(index, fid, map) (index, fid, map)
} }
/// Checks that all the candidates are distinct, and returns the candidates number. /// Checks that all the candidates are distinct, and returns the candidates number.
pub(crate) fn validate_distinct_candidates( pub(crate) fn validate_distinct_candidates(
candidates: impl Iterator<Item = crate::Result<DocumentId>>, candidates: impl Iterator<Item = crate::Result<DocumentId>>,
@ -117,7 +116,7 @@ mod test {
let s = value.to_string(); let s = value.to_string();
assert!(seen.insert(s)); assert!(seen.insert(s));
} }
Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)),
} }
} }

View File

@ -1,7 +1,8 @@
use roaring::{RoaringBitmap, bitmap::IntoIter}; use roaring::bitmap::IntoIter;
use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::{DocumentId, Result}; use crate::{DocumentId, Result};
use super::{DocIter, Distinct};
/// A distinct implementer that does not perform any distinct, /// A distinct implementer that does not perform any distinct,
/// and simply returns an iterator to the candidates. /// and simply returns an iterator to the candidates.
@ -30,10 +31,7 @@ impl Distinct for NoopDistinct {
type Iter = NoopDistinctIter; type Iter = NoopDistinctIter;
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
NoopDistinctIter { NoopDistinctIter { candidates: candidates.into_iter(), excluded }
candidates: candidates.into_iter(),
excluded,
}
} }
} }

View File

@ -1,16 +1,16 @@
use std::collections::{HashSet, BTreeMap}; use std::collections::{BTreeMap, HashSet};
use std::ops::Bound::Unbounded; use std::ops::Bound::Unbounded;
use std::{cmp, fmt}; use std::{cmp, fmt};
use heed::{Database, BytesDecode};
use heed::types::{ByteSlice, Unit}; use heed::types::{ByteSlice, Unit};
use heed::{BytesDecode, Database};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::FieldIdMapMissingEntry; use crate::error::FieldIdMapMissingEntry;
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::heed_codec::facet::FacetValueStringCodec; use crate::heed_codec::facet::FacetValueStringCodec;
use crate::search::facet::{FacetIter, FacetRange}; use crate::search::facet::{FacetIter, FacetRange};
use crate::{Index, FieldId, DocumentId, Result}; use crate::{DocumentId, FieldId, Index, Result};
/// The default number of values by facets that will /// The default number of values by facets that will
/// be fetched from the key-value store. /// be fetched from the key-value store.
@ -66,8 +66,7 @@ impl<'a> FacetDistribution<'a> {
facet_type: FacetType, facet_type: FacetType,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>, distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> ) -> heed::Result<()> {
{
fn fetch_facet_values<'t, KC, K: 't>( fn fetch_facet_values<'t, KC, K: 't>(
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
db: Database<KC, Unit>, db: Database<KC, Unit>,
@ -102,7 +101,7 @@ impl<'a> FacetDistribution<'a> {
FacetType::Number => { FacetType::Number => {
let db = self.index.field_id_docid_facet_f64s; let db = self.index.field_id_docid_facet_f64s;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
}, }
FacetType::String => { FacetType::String => {
let db = self.index.field_id_docid_facet_strings; let db = self.index.field_id_docid_facet_strings;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
@ -117,11 +116,9 @@ impl<'a> FacetDistribution<'a> {
field_id: FieldId, field_id: FieldId,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>, distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> ) -> heed::Result<()> {
{ let iter =
let iter = FacetIter::new_non_reducing( FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
self.rtxn, self.index, field_id, candidates.clone(),
)?;
for result in iter { for result in iter {
let (value, mut docids) = result?; let (value, mut docids) = result?;
@ -142,8 +139,7 @@ impl<'a> FacetDistribution<'a> {
fn facet_values_from_raw_facet_database( fn facet_values_from_raw_facet_database(
&self, &self,
field_id: FieldId, field_id: FieldId,
) -> heed::Result<BTreeMap<String, u64>> ) -> heed::Result<BTreeMap<String, u64>> {
{
let mut distribution = BTreeMap::new(); let mut distribution = BTreeMap::new();
let db = self.index.facet_id_f64_docids; let db = self.index.facet_id_f64_docids;
@ -157,7 +153,8 @@ impl<'a> FacetDistribution<'a> {
} }
} }
let iter = self.index let iter = self
.index
.facet_id_string_docids .facet_id_string_docids
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])? .prefix_iter(self.rtxn, &[field_id])?
@ -182,11 +179,30 @@ impl<'a> FacetDistribution<'a> {
// to those candidates. We also enter here for facet strings for performance reasons. // to those candidates. We also enter here for facet strings for performance reasons.
let mut distribution = BTreeMap::new(); let mut distribution = BTreeMap::new();
if candidates.len() <= CANDIDATES_THRESHOLD { if candidates.len() <= CANDIDATES_THRESHOLD {
self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?; self.facet_distribution_from_documents(
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; field_id,
Number,
candidates,
&mut distribution,
)?;
self.facet_distribution_from_documents(
field_id,
String,
candidates,
&mut distribution,
)?;
} else { } else {
self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?; self.facet_numbers_distribution_from_facet_levels(
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; field_id,
candidates,
&mut distribution,
)?;
self.facet_distribution_from_documents(
field_id,
String,
candidates,
&mut distribution,
)?;
} }
Ok(distribution) Ok(distribution)
@ -201,7 +217,8 @@ impl<'a> FacetDistribution<'a> {
let mut distribution = BTreeMap::new(); let mut distribution = BTreeMap::new();
for name in filterable_fields { for name in filterable_fields {
let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { let fid =
fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: name.clone(), field_name: name.clone(),
process: "FacetDistribution::execute", process: "FacetDistribution::execute",
})?; })?;
@ -215,13 +232,7 @@ impl<'a> FacetDistribution<'a> {
impl fmt::Debug for FacetDistribution<'_> { impl fmt::Debug for FacetDistribution<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let FacetDistribution { let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self;
facets,
candidates,
max_values_by_facet,
rtxn: _,
index: _,
} = self;
f.debug_struct("FacetDistribution") f.debug_struct("FacetDistribution")
.field("facets", facets) .field("facets", facets)

View File

@ -1,6 +1,6 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Bound::{self, Included, Excluded}; use std::ops::Bound::{self, Excluded, Included};
use std::result::Result as StdResult; use std::result::Result as StdResult;
use std::str::FromStr; use std::str::FromStr;
@ -12,16 +12,13 @@ use pest::iterators::{Pair, Pairs};
use pest::Parser; use pest::Parser;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result};
use super::FacetRange;
use super::parser::Rule;
use super::parser::{PREC_CLIMBER, FilterParser};
use self::FilterCondition::*; use self::FilterCondition::*;
use self::Operator::*; use self::Operator::*;
use super::parser::{FilterParser, Rule, PREC_CLIMBER};
use super::FacetRange;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec};
use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
pub enum Operator { pub enum Operator {
@ -63,7 +60,8 @@ impl FilterCondition {
index: &Index, index: &Index,
array: I, array: I,
) -> Result<Option<FilterCondition>> ) -> Result<Option<FilterCondition>>
where I: IntoIterator<Item=Either<J, B>>, where
I: IntoIterator<Item = Either<J, B>>,
J: IntoIterator<Item = A>, J: IntoIterator<Item = A>,
A: AsRef<str>, A: AsRef<str>,
B: AsRef<str>, B: AsRef<str>,
@ -88,7 +86,7 @@ impl FilterCondition {
None => Some(rule), None => Some(rule),
}; };
} }
}, }
Either::Right(rule) => { Either::Right(rule) => {
let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?;
ands = match ands.take() { ands = match ands.take() {
@ -106,11 +104,11 @@ impl FilterCondition {
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
expression: &str, expression: &str,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let filterable_fields = index.filterable_fields_ids(rtxn)?; let filterable_fields = index.filterable_fields_ids(rtxn)?;
let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; let lexed =
FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?;
FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed)
} }
@ -118,8 +116,7 @@ impl FilterCondition {
fim: &FieldsIdsMap, fim: &FieldsIdsMap,
ff: &HashSet<FieldId>, ff: &HashSet<FieldId>,
expression: Pairs<Rule>, expression: Pairs<Rule>,
) -> Result<Self> ) -> Result<Self> {
{
PREC_CLIMBER.climb( PREC_CLIMBER.climb(
expression, expression,
|pair: Pair<Rule>| match pair.as_rule() { |pair: Pair<Rule>| match pair.as_rule() {
@ -135,12 +132,10 @@ impl FilterCondition {
Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()),
_ => unreachable!(), _ => unreachable!(),
}, },
|lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| { |lhs: Result<Self>, op: Pair<Rule>, rhs: Result<Self>| match op.as_rule() {
match op.as_rule() {
Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))),
Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))),
_ => unreachable!(), _ => unreachable!(),
}
}, },
) )
} }
@ -160,8 +155,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let mut items = item.into_inner(); let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items) let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?; .map_err(UserError::InvalidFilterAttribute)?;
@ -179,8 +173,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let mut items = item.into_inner(); let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items) let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?; .map_err(UserError::InvalidFilterAttribute)?;
@ -196,8 +189,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let mut items = item.into_inner(); let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items) let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?; .map_err(UserError::InvalidFilterAttribute)?;
@ -213,8 +205,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let mut items = item.into_inner(); let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items) let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?; .map_err(UserError::InvalidFilterAttribute)?;
@ -230,8 +221,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let mut items = item.into_inner(); let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items) let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?; .map_err(UserError::InvalidFilterAttribute)?;
@ -247,8 +237,7 @@ impl FilterCondition {
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> Result<FilterCondition> ) -> Result<FilterCondition> {
{
let mut items = item.into_inner(); let mut items = item.into_inner();
let fid = field_id(fields_ids_map, filterable_fields, &mut items) let fid = field_id(fields_ids_map, filterable_fields, &mut items)
.map_err(UserError::InvalidFilterAttribute)?; .map_err(UserError::InvalidFilterAttribute)?;
@ -272,13 +261,14 @@ impl FilterCondition {
left: Bound<f64>, left: Bound<f64>,
right: Bound<f64>, right: Bound<f64>,
output: &mut RoaringBitmap, output: &mut RoaringBitmap,
) -> Result<()> ) -> Result<()> {
{
match (left, right) { match (left, right) {
// If the request is an exact value we must go directly to the deepest level. // If the request is an exact value we must go directly to the deepest level.
(Included(l), Included(r)) if l == r && level > 0 => { (Included(l), Included(r)) if l == r && level > 0 => {
return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output); return Self::explore_facet_number_levels(
}, rtxn, db, field_id, 0, left, right, output,
);
}
// lower TO upper when lower > upper must return no result // lower TO upper when lower > upper must return no result
(Included(l), Included(r)) if l > r => return Ok(()), (Included(l), Included(r)) if l > r => return Ok(()),
(Included(l), Excluded(r)) if l >= r => return Ok(()), (Included(l), Excluded(r)) if l >= r => return Ok(()),
@ -301,7 +291,9 @@ impl FilterCondition {
debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len());
output.union_with(&docids); output.union_with(&docids);
// We save the leftest and rightest bounds we actually found at this level. // We save the leftest and rightest bounds we actually found at this level.
if i == 0 { left_found = Some(l); } if i == 0 {
left_found = Some(l);
}
right_found = Some(r); right_found = Some(r);
} }
@ -318,20 +310,50 @@ impl FilterCondition {
// If the bound is satisfied we avoid calling this function again. // If the bound is satisfied we avoid calling this function again.
if !matches!(left, Included(l) if l == left_found) { if !matches!(left, Included(l) if l == left_found) {
let sub_right = Excluded(left_found); let sub_right = Excluded(left_found);
debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); debug!(
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?; "calling left with {:?} to {:?} (level {})",
left, sub_right, deeper_level
);
Self::explore_facet_number_levels(
rtxn,
db,
field_id,
deeper_level,
left,
sub_right,
output,
)?;
} }
if !matches!(right, Included(r) if r == right_found) { if !matches!(right, Included(r) if r == right_found) {
let sub_left = Excluded(right_found); let sub_left = Excluded(right_found);
debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); debug!(
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?; "calling right with {:?} to {:?} (level {})",
sub_left, right, deeper_level
);
Self::explore_facet_number_levels(
rtxn,
db,
field_id,
deeper_level,
sub_left,
right,
output,
)?;
}
} }
},
None => { None => {
// If we found nothing at this level it means that we must find // If we found nothing at this level it means that we must find
// the same bounds but at a deeper, more precise level. // the same bounds but at a deeper, more precise level.
Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?; Self::explore_facet_number_levels(
}, rtxn,
db,
field_id,
deeper_level,
left,
right,
output,
)?;
}
} }
Ok(()) Ok(())
@ -344,8 +366,7 @@ impl FilterCondition {
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>, strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId, field_id: FieldId,
operator: &Operator, operator: &Operator,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
// Make sure we always bound the ranges with the field id and the level, // Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the // as the facets values are all in the same database and prefixed by the
// field id and the level. // field id and the level.
@ -358,13 +379,21 @@ impl FilterCondition {
Some(n) => { Some(n) => {
let n = Included(*n); let n = Included(*n);
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?; Self::explore_facet_number_levels(
rtxn,
numbers_db,
field_id,
0,
n,
n,
&mut output,
)?;
output output
}, }
None => RoaringBitmap::new(), None => RoaringBitmap::new(),
}; };
return Ok(string_docids | number_docids); return Ok(string_docids | number_docids);
}, }
NotEqual(number, string) => { NotEqual(number, string) => {
let all_numbers_ids = if number.is_some() { let all_numbers_ids = if number.is_some() {
index.number_faceted_documents_ids(rtxn, field_id)? index.number_faceted_documents_ids(rtxn, field_id)?
@ -373,9 +402,11 @@ impl FilterCondition {
}; };
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Equal(*number, string.clone()); let operator = Equal(*number, string.clone());
let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?; let docids = Self::evaluate_operator(
rtxn, index, numbers_db, strings_db, field_id, &operator,
)?;
return Ok((all_numbers_ids | all_strings_ids) - docids); return Ok((all_numbers_ids | all_strings_ids) - docids);
}, }
LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
Between(left, right) => (Included(*left), Included(*right)), Between(left, right) => (Included(*left), Included(*right)),
@ -391,36 +422,39 @@ impl FilterCondition {
match biggest_level { match biggest_level {
Some(level) => { Some(level) => {
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?; Self::explore_facet_number_levels(
rtxn,
numbers_db,
field_id,
level,
left,
right,
&mut output,
)?;
Ok(output) Ok(output)
}, }
None => Ok(RoaringBitmap::new()), None => Ok(RoaringBitmap::new()),
} }
} }
pub fn evaluate( pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
&self,
rtxn: &heed::RoTxn,
index: &Index,
) -> Result<RoaringBitmap>
{
let numbers_db = index.facet_id_f64_docids; let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids; let strings_db = index.facet_id_string_docids;
match self { match self {
Operator(fid, op) => { Operator(fid, op) => {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op)
}, }
Or(lhs, rhs) => { Or(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?; let lhs = lhs.evaluate(rtxn, index)?;
let rhs = rhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?;
Ok(lhs | rhs) Ok(lhs | rhs)
}, }
And(lhs, rhs) => { And(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?; let lhs = lhs.evaluate(rtxn, index)?;
let rhs = rhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?;
Ok(lhs & rhs) Ok(lhs & rhs)
}, }
} }
} }
} }
@ -434,14 +468,14 @@ fn field_id(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
filterable_fields: &HashSet<FieldId>, filterable_fields: &HashSet<FieldId>,
items: &mut Pairs<Rule>, items: &mut Pairs<Rule>,
) -> StdResult<FieldId, PestError<Rule>> ) -> StdResult<FieldId, PestError<Rule>> {
{
// lexing ensures that we at least have a key // lexing ensures that we at least have a key
let key = items.next().unwrap(); let key = items.next().unwrap();
let field_id = match fields_ids_map.id(key.as_str()) { let field_id = match fields_ids_map.id(key.as_str()) {
Some(field_id) => field_id, Some(field_id) => field_id,
None => return Err(PestError::new_from_span( None => {
return Err(PestError::new_from_span(
ErrorVariant::CustomError { ErrorVariant::CustomError {
message: format!( message: format!(
"attribute `{}` not found, available attributes are: {}", "attribute `{}` not found, available attributes are: {}",
@ -450,7 +484,8 @@ fn field_id(
), ),
}, },
key.as_span(), key.as_span(),
)), ))
}
}; };
if !filterable_fields.contains(&field_id) { if !filterable_fields.contains(&field_id) {
@ -459,9 +494,11 @@ fn field_id(
message: format!( message: format!(
"attribute `{}` is not filterable, available filterable attributes are: {}", "attribute `{}` is not filterable, available filterable attributes are: {}",
key.as_str(), key.as_str(),
filterable_fields.iter().flat_map(|id| { filterable_fields
fields_ids_map.name(*id) .iter()
}).collect::<Vec<_>>().join(", "), .flat_map(|id| { fields_ids_map.name(*id) })
.collect::<Vec<_>>()
.join(", "),
), ),
}, },
key.as_span(), key.as_span(),
@ -476,7 +513,8 @@ fn field_id(
/// ///
/// Returns the parsing error associated with the span if the conversion fails. /// Returns the parsing error associated with the span if the conversion fails.
fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, pest::error::Error<Rule>>, String) fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, pest::error::Error<Rule>>, String)
where T: FromStr, where
T: FromStr,
T::Err: ToString, T::Err: ToString,
{ {
let result = match pair.as_str().parse::<T>() { let result = match pair.as_str().parse::<T>() {
@ -492,11 +530,12 @@ where T: FromStr,
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use big_s::S;
use crate::update::Settings;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::hashset; use maplit::hashset;
use big_s::S;
use super::*;
use crate::update::Settings;
#[test] #[test]
fn string() { fn string() {
@ -548,10 +587,8 @@ mod tests {
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or( let expected =
Box::new(Operator(0, LowerThan(22.0))), Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0))));
Box::new(Operator(0, GreaterThan(44.0))),
);
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }
@ -573,22 +610,26 @@ mod tests {
// Test that the facet condition is correctly generated. // Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FilterCondition::from_str( let condition = FilterCondition::from_str(
&rtxn, &index, &rtxn,
&index,
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
).unwrap(); )
.unwrap();
let expected = Or( let expected = Or(
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(And( Box::new(And(
Box::new(Operator(1, Between(22.0, 44.0))), Box::new(Operator(1, Between(22.0, 44.0))),
Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))),
)) )),
); );
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FilterCondition::from_str( let condition = FilterCondition::from_str(
&rtxn, &index, &rtxn,
&index,
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
).unwrap(); )
.unwrap();
let expected = Or( let expected = Or(
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(Or( Box::new(Or(
@ -620,13 +661,21 @@ mod tests {
// Test that the facet condition is correctly generated. // Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FilterCondition::from_array( let condition = FilterCondition::from_array(
&rtxn, &index, &rtxn,
vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], &index,
).unwrap().unwrap(); vec![
Either::Right("channel = gotaga"),
Either::Left(vec!["timestamp = 44", "channel != ponce"]),
],
)
.unwrap()
.unwrap();
let expected = FilterCondition::from_str( let expected = FilterCondition::from_str(
&rtxn, &index, &rtxn,
&index,
"channel = gotaga AND (timestamp = 44 OR channel != ponce)", "channel = gotaga AND (timestamp = 44 OR channel != ponce)",
).unwrap(); )
.unwrap();
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }
} }

View File

@ -1,20 +1,19 @@
use std::ops::Bound::{self, Included, Excluded, Unbounded}; use std::ops::Bound::{self, Excluded, Included, Unbounded};
use either::Either::{self, Left, Right}; use either::Either::{self, Left, Right};
use heed::types::{DecodeIgnore, ByteSlice}; use heed::types::{ByteSlice, DecodeIgnore};
use heed::{Database, RoRange, RoRevRange, LazyDecode}; use heed::{Database, LazyDecode, RoRange, RoRevRange};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{Index, FieldId};
pub use self::facet_distribution::FacetDistribution; pub use self::facet_distribution::FacetDistribution;
pub use self::filter_condition::{FilterCondition, Operator}; pub use self::filter_condition::{FilterCondition, Operator};
pub(crate) use self::parser::Rule as ParserRule; pub(crate) use self::parser::Rule as ParserRule;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{FieldId, Index};
mod filter_condition;
mod facet_distribution; mod facet_distribution;
mod filter_condition;
mod parser; mod parser;
pub struct FacetRange<'t> { pub struct FacetRange<'t> {
@ -30,8 +29,7 @@ impl<'t> FacetRange<'t> {
level: u8, level: u8,
left: Bound<f64>, left: Bound<f64>,
right: Bound<f64>, right: Bound<f64>,
) -> heed::Result<FacetRange<'t>> ) -> heed::Result<FacetRange<'t>> {
{
let left_bound = match left { let left_bound = match left {
Included(left) => Included((field_id, level, left, f64::MIN)), Included(left) => Included((field_id, level, left, f64::MIN)),
Excluded(left) => Excluded((field_id, level, left, f64::MIN)), Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
@ -62,7 +60,7 @@ impl<'t> Iterator for FacetRange<'t> {
} else { } else {
None None
} }
}, }
Some(Err(e)) => Some(Err(e)), Some(Err(e)) => Some(Err(e)),
None => None, None => None,
} }
@ -82,8 +80,7 @@ impl<'t> FacetRevRange<'t> {
level: u8, level: u8,
left: Bound<f64>, left: Bound<f64>,
right: Bound<f64>, right: Bound<f64>,
) -> heed::Result<FacetRevRange<'t>> ) -> heed::Result<FacetRevRange<'t>> {
{
let left_bound = match left { let left_bound = match left {
Included(left) => Included((field_id, level, left, f64::MIN)), Included(left) => Included((field_id, level, left, f64::MIN)),
Excluded(left) => Excluded((field_id, level, left, f64::MIN)), Excluded(left) => Excluded((field_id, level, left, f64::MIN)),
@ -114,7 +111,7 @@ impl<'t> Iterator for FacetRevRange<'t> {
} }
} }
continue; continue;
}, }
Some(Err(e)) => return Some(Err(e)), Some(Err(e)) => return Some(Err(e)),
None => return None, None => return None,
} }
@ -139,11 +136,11 @@ impl<'t> FacetIter<'t> {
index: &'t Index, index: &'t Index,
field_id: FieldId, field_id: FieldId,
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>> ) -> heed::Result<FacetIter<'t>> {
{
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter =
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))]; let level_iters = vec![(documents_ids, Left(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
} }
@ -156,11 +153,11 @@ impl<'t> FacetIter<'t> {
index: &'t Index, index: &'t Index,
field_id: FieldId, field_id: FieldId,
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>> ) -> heed::Result<FacetIter<'t>> {
{
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter =
FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Right(highest_iter))]; let level_iters = vec![(documents_ids, Right(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true })
} }
@ -174,11 +171,11 @@ impl<'t> FacetIter<'t> {
index: &'t Index, index: &'t Index,
field_id: FieldId, field_id: FieldId,
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>> ) -> heed::Result<FacetIter<'t>> {
{
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>(); let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter =
FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))]; let level_iters = vec![(documents_ids, Left(highest_iter))];
Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false })
} }
@ -187,12 +184,13 @@ impl<'t> FacetIter<'t> {
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
db: Database<FacetLevelValueF64Codec, X>, db: Database<FacetLevelValueF64Codec, X>,
fid: FieldId, fid: FieldId,
) -> heed::Result<Option<u8>> ) -> heed::Result<Option<u8>> {
{ let level = db
let level = db.remap_types::<ByteSlice, DecodeIgnore>() .remap_types::<ByteSlice, DecodeIgnore>()
.prefix_iter(rtxn, &[fid][..])? .prefix_iter(rtxn, &[fid][..])?
.remap_key_type::<FacetLevelValueF64Codec>() .remap_key_type::<FacetLevelValueF64Codec>()
.last().transpose()? .last()
.transpose()?
.map(|((_, level, _, _), _)| level); .map(|((_, level, _, _), _)| level);
Ok(level) Ok(level)
} }
@ -215,7 +213,6 @@ impl<'t> Iterator for FacetIter<'t> {
match result { match result {
Ok(((_fid, level, left, right), mut docids)) => { Ok(((_fid, level, left, right), mut docids)) => {
docids.intersect_with(&documents_ids); docids.intersect_with(&documents_ids);
if !docids.is_empty() { if !docids.is_empty() {
if self.must_reduce { if self.must_reduce {
@ -242,11 +239,11 @@ impl<'t> Iterator for FacetIter<'t> {
Ok(iter) => { Ok(iter) => {
self.level_iters.push((docids, iter)); self.level_iters.push((docids, iter));
continue 'outer; continue 'outer;
}, }
Err(e) => return Some(Err(e)), Err(e) => return Some(Err(e)),
} }
} }
}, }
Err(e) => return Some(Err(e)), Err(e) => return Some(Err(e)),
} }
} }

View File

@ -1,5 +1,5 @@
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use pest::prec_climber::{Operator, Assoc, PrecClimber}; use pest::prec_climber::{Assoc, Operator, PrecClimber};
pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| { pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| {
use Assoc::*; use Assoc::*;

View File

@ -1,13 +1,11 @@
use std::collections::HashSet;
use std::cmp::{min, Reverse}; use std::cmp::{min, Reverse};
use std::collections::BTreeMap; use std::collections::{BTreeMap, HashSet};
use std::ops::{Index, IndexMut}; use std::ops::{Index, IndexMut};
use levenshtein_automata::{DFA, Distance}; use levenshtein_automata::{Distance, DFA};
use crate::search::query_tree::{Operation, Query};
use super::build_dfa; use super::build_dfa;
use crate::search::query_tree::{Operation, Query};
type IsPrefix = bool; type IsPrefix = bool;
@ -28,7 +26,9 @@ impl MatchingWords {
.collect(); .collect();
// Sort word by len in DESC order prioritizing the longuest word, // Sort word by len in DESC order prioritizing the longuest word,
// in order to highlight the longuest part of the matched word. // in order to highlight the longuest part of the matched word.
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| {
Reverse(query_word.len())
});
Self { dfas } Self { dfas }
} }
@ -37,12 +37,13 @@ impl MatchingWords {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) {
Distance::Exact(t) if t <= *typo => { Distance::Exact(t) if t <= *typo => {
if *is_prefix { if *is_prefix {
let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); let (_dist, len) =
prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes());
Some(len) Some(len)
} else { } else {
Some(word.len()) Some(word.len())
} }
}, }
_otherwise => None, _otherwise => None,
}) })
} }
@ -54,11 +55,11 @@ fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
match tree { match tree {
Operation::Or(_, ops) | Operation::And(ops) => { Operation::Or(_, ops) | Operation::And(ops) => {
ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
}, }
Operation::Query(Query { prefix, kind }) => { Operation::Query(Query { prefix, kind }) => {
let typo = if kind.is_exact() { 0 } else { kind.typo() }; let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix)); out.insert((kind.word(), typo, *prefix));
}, }
Operation::Phrase(words) => { Operation::Phrase(words) => {
for word in words { for word in words {
out.insert((word, 0, false)); out.insert((word, 0, false));
@ -80,10 +81,7 @@ struct N2Array<T> {
impl<T: Clone> N2Array<T> { impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> { fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array { N2Array { y_size: y, buf: vec![value; x * y] }
y_size: y,
buf: vec![value; x * y],
}
} }
} }
@ -178,9 +176,8 @@ fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::MatchingWords;
use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::query_tree::{Operation, Query, QueryKind};
use crate::MatchingWords;
#[test] #[test]
fn matched_length() { fn matched_length() {
@ -194,13 +191,23 @@ mod tests {
#[test] #[test]
fn matching_words() { fn matching_words() {
let query_tree = Operation::Or(false, vec![ let query_tree = Operation::Or(
Operation::And(vec![ false,
Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }), vec![Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }), prefix: true,
]), kind: QueryKind::exact("split".to_string()),
]); }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let matching_words = MatchingWords::from_query_tree(&query_tree); let matching_words = MatchingWords::from_query_tree(&query_tree);

View File

@ -6,6 +6,7 @@ use std::result::Result as StdResult;
use std::str::Utf8Error; use std::str::Utf8Error;
use std::time::Instant; use std::time::Instant;
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::debug; use log::debug;
@ -13,16 +14,13 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap; use roaring::bitmap::RoaringBitmap;
pub(crate) use self::facet::ParserRule;
pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator};
pub use self::matching_words::MatchingWords;
use self::query_tree::QueryTreeBuilder;
use crate::error::FieldIdMapMissingEntry; use crate::error::FieldIdMapMissingEntry;
use crate::search::criteria::r#final::{Final, FinalResult}; use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{Index, DocumentId, Result}; use crate::{DocumentId, Index, Result};
pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator};
pub use self::matching_words::MatchingWords;
pub(crate) use self::facet::ParserRule;
use self::query_tree::QueryTreeBuilder;
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
// Building these factories is not free. // Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -32,8 +30,8 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod criteria; mod criteria;
mod distinct; mod distinct;
mod facet; mod facet;
mod query_tree;
mod matching_words; mod matching_words;
mod query_tree;
pub struct Search<'a> { pub struct Search<'a> {
query: Option<String>, query: Option<String>,
@ -117,7 +115,7 @@ impl<'a> Search<'a> {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq)))
}, }
None => (None, None), None => (None, None),
}; };
@ -144,7 +142,8 @@ impl<'a> Search<'a> {
None => self.perform_sort(NoopDistinct, matching_words, criteria), None => self.perform_sort(NoopDistinct, matching_words, criteria),
Some(name) => { Some(name) => {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { let id =
field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: name.to_string(), field_name: name.to_string(),
process: "distinct attribute", process: "distinct attribute",
})?; })?;
@ -159,14 +158,15 @@ impl<'a> Search<'a> {
mut distinct: D, mut distinct: D,
matching_words: MatchingWords, matching_words: MatchingWords,
mut criteria: Final, mut criteria: Final,
) -> Result<SearchResult> ) -> Result<SearchResult> {
{
let mut offset = self.offset; let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new(); let mut initial_candidates = RoaringBitmap::new();
let mut excluded_candidates = RoaringBitmap::new(); let mut excluded_candidates = RoaringBitmap::new();
let mut documents_ids = Vec::with_capacity(self.limit); let mut documents_ids = Vec::with_capacity(self.limit);
while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? { while let Some(FinalResult { candidates, bucket_candidates, .. }) =
criteria.next(&excluded_candidates)?
{
debug!("Number of candidates found {}", candidates.len()); debug!("Number of candidates found {}", candidates.len());
let excluded = take(&mut excluded_candidates); let excluded = take(&mut excluded_candidates);
@ -183,7 +183,9 @@ impl<'a> Search<'a> {
for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
documents_ids.push(candidate?); documents_ids.push(candidate?);
} }
if documents_ids.len() == self.limit { break } if documents_ids.len() == self.limit {
break;
}
excluded_candidates = candidates.into_excluded(); excluded_candidates = candidates.into_excluded();
} }
@ -247,7 +249,7 @@ pub fn word_derivations<'c>(
} }
Ok(entry.insert(derived_words)) Ok(entry.insert(derived_words))
}, }
} }
} }

View File

@ -1,4 +1,4 @@
use std::{fmt, cmp, mem}; use std::{cmp, fmt, mem};
use fst::Set; use fst::Set;
use meilisearch_tokenizer::token::SeparatorKind; use meilisearch_tokenizer::token::SeparatorKind;
@ -28,18 +28,18 @@ impl fmt::Debug for Operation {
Operation::And(children) => { Operation::And(children) => {
writeln!(f, "{:1$}AND", "", depth * 2)?; writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
}, }
Operation::Phrase(children) => { Operation::Phrase(children) => {
writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
}, }
Operation::Or(true, children) => { Operation::Or(true, children) => {
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
}, }
Operation::Or(false, children) => { Operation::Or(false, children) => {
writeln!(f, "{:1$}OR", "", depth * 2)?; writeln!(f, "{:1$}OR", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
}, }
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
} }
} }
@ -136,10 +136,12 @@ impl fmt::Debug for Query {
match kind { match kind {
QueryKind::Exact { word, .. } => { QueryKind::Exact { word, .. } => {
f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() f.debug_struct(&(prefix + "Exact")).field("word", &word).finish()
}, }
QueryKind::Tolerant { typo, word } => { QueryKind::Tolerant { typo, word } => f
f.debug_struct(&(prefix + "Tolerant")).field("word", &word).field("max typo", &typo).finish() .debug_struct(&(prefix + "Tolerant"))
}, .field("word", &word)
.field("max typo", &typo)
.finish(),
} }
} }
} }
@ -223,7 +225,12 @@ impl<'a> QueryTreeBuilder<'a> {
let stop_words = self.index.stop_words(self.rtxn)?; let stop_words = self.index.stop_words(self.rtxn)?;
let primitive_query = create_primitive_query(query, stop_words, self.words_limit); let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
if !primitive_query.is_empty() { if !primitive_query.is_empty() {
let qt = create_query_tree(self, self.optional_words, self.authorize_typos, &primitive_query)?; let qt = create_query_tree(
self,
self.optional_words,
self.authorize_typos,
&primitive_query,
)?;
Ok(Some((qt, primitive_query))) Ok(Some((qt, primitive_query)))
} else { } else {
Ok(None) Ok(None)
@ -248,12 +255,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
} }
} }
Ok(best.map(|(_, left, right)| Operation::Phrase( Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
vec![
left.to_string(),
right.to_string()
]
)))
} }
/// Return the `QueryKind` of a word depending on `authorize_typos` /// Return the `QueryKind` of a word depending on `authorize_typos`
@ -276,12 +278,18 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
let synonyms = ctx.synonyms(word)?; let synonyms = ctx.synonyms(word)?;
Ok(synonyms.map(|synonyms| { Ok(synonyms.map(|synonyms| {
synonyms.into_iter().map(|synonym| { synonyms
let words = synonym.into_iter().map(|word| { .into_iter()
.map(|synonym| {
let words = synonym
.into_iter()
.map(|word| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) }) Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) })
}).collect(); })
.collect();
Operation::and(words) Operation::and(words)
}).collect() })
.collect()
})) }))
} }
@ -291,15 +299,13 @@ fn create_query_tree(
optional_words: bool, optional_words: bool,
authorize_typos: bool, authorize_typos: bool,
query: &[PrimitiveQueryPart], query: &[PrimitiveQueryPart],
) -> Result<Operation> ) -> Result<Operation> {
{
/// Matches on the `PrimitiveQueryPart` and create an operation from it. /// Matches on the `PrimitiveQueryPart` and create an operation from it.
fn resolve_primitive_part( fn resolve_primitive_part(
ctx: &impl Context, ctx: &impl Context,
authorize_typos: bool, authorize_typos: bool,
part: PrimitiveQueryPart, part: PrimitiveQueryPart,
) -> Result<Operation> ) -> Result<Operation> {
{
match part { match part {
// 1. try to split word in 2 // 1. try to split word in 2
// 2. try to fetch synonyms // 2. try to fetch synonyms
@ -310,13 +316,12 @@ fn create_query_tree(
if let Some(child) = split_best_frequency(ctx, &word)? { if let Some(child) = split_best_frequency(ctx, &word)? {
children.push(child); children.push(child);
} }
children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); children
.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
Ok(Operation::or(false, children)) Ok(Operation::or(false, children))
}, }
// create a CONSECUTIVE operation wrapping all word in the phrase // create a CONSECUTIVE operation wrapping all word in the phrase
PrimitiveQueryPart::Phrase(words) => { PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)),
Ok(Operation::phrase(words))
},
} }
} }
@ -325,8 +330,7 @@ fn create_query_tree(
ctx: &impl Context, ctx: &impl Context,
authorize_typos: bool, authorize_typos: bool,
query: &[PrimitiveQueryPart], query: &[PrimitiveQueryPart],
) -> Result<Operation> ) -> Result<Operation> {
{
const MAX_NGRAM: usize = 3; const MAX_NGRAM: usize = 3;
let mut op_children = Vec::new(); let mut op_children = Vec::new();
@ -341,21 +345,26 @@ fn create_query_tree(
match group { match group {
[part] => { [part] => {
let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; let operation =
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
and_op_children.push(operation); and_op_children.push(operation);
}, }
words => { words => {
let is_prefix = words.last().map_or(false, |part| part.is_prefix()); let is_prefix = words.last().map_or(false, |part| part.is_prefix());
let words: Vec<_> = words.iter().filter_map(|part| { let words: Vec<_> = words
.iter()
.filter_map(|part| {
if let PrimitiveQueryPart::Word(word, _) = part { if let PrimitiveQueryPart::Word(word, _) = part {
Some(word.as_str()) Some(word.as_str())
} else { } else {
None None
} }
}).collect(); })
.collect();
let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
let concat = words.concat(); let concat = words.concat();
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; let query =
Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
operations.push(Operation::Query(query)); operations.push(Operation::Query(query));
and_op_children.push(Operation::or(false, operations)); and_op_children.push(Operation::or(false, operations));
} }
@ -379,15 +388,16 @@ fn create_query_tree(
ctx: &impl Context, ctx: &impl Context,
authorize_typos: bool, authorize_typos: bool,
query: PrimitiveQuery, query: PrimitiveQuery,
) -> Result<Operation> ) -> Result<Operation> {
{
let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
let mut operation_children = Vec::new(); let mut operation_children = Vec::new();
let start = number_phrases + (number_phrases == 0) as usize; let start = number_phrases + (number_phrases == 0) as usize;
for len in start..=query.len() { for len in start..=query.len() {
let mut word_count = len - number_phrases; let mut word_count = len - number_phrases;
let query: Vec<_> = query.iter().filter(|p| { let query: Vec<_> = query
.iter()
.filter(|p| {
if p.is_phrase() { if p.is_phrase() {
true true
} else if word_count != 0 { } else if word_count != 0 {
@ -434,7 +444,11 @@ impl PrimitiveQueryPart {
/// Create primitive query from tokenized query string, /// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree. /// the primitive query is an intermediate state to build the query tree.
fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, words_limit: Option<usize>) -> PrimitiveQuery { fn create_primitive_query(
query: TokenStream,
stop_words: Option<Set<&[u8]>>,
words_limit: Option<usize>,
) -> PrimitiveQuery {
let mut primitive_query = Vec::new(); let mut primitive_query = Vec::new();
let mut phrase = Vec::new(); let mut phrase = Vec::new();
let mut quoted = false; let mut quoted = false;
@ -444,7 +458,9 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
let mut peekable = query.peekable(); let mut peekable = query.peekable();
while let Some(token) = peekable.next() { while let Some(token) = peekable.next() {
// early return if word limit is exceeded // early return if word limit is exceeded
if primitive_query.len() >= parts_limit { return primitive_query } if primitive_query.len() >= parts_limit {
return primitive_query;
}
match token.kind { match token.kind {
TokenKind::Word | TokenKind::StopWord => { TokenKind::Word | TokenKind::StopWord => {
@ -454,13 +470,17 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
if quoted { if quoted {
phrase.push(token.word.to_string()); phrase.push(token.word.to_string());
} else if peekable.peek().is_some() { } else if peekable.peek().is_some() {
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) { if !stop_words
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); .as_ref()
.map_or(false, |swords| swords.contains(token.word.as_ref()))
{
primitive_query
.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
} }
} else { } else {
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
} }
}, }
TokenKind::Separator(separator_kind) => { TokenKind::Separator(separator_kind) => {
let quote_count = token.word.chars().filter(|&s| s == '"').count(); let quote_count = token.word.chars().filter(|&s| s == '"').count();
// swap quoted state if we encounter a double quote // swap quoted state if we encounter a double quote
@ -468,10 +488,11 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
quoted = !quoted; quoted = !quoted;
} }
// if there is a quote or a hard separator we close the phrase. // if there is a quote or a hard separator we close the phrase.
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
{
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
} }
}, }
_ => (), _ => (),
} }
} }
@ -486,7 +507,7 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
/// Returns the maximum number of typos that this Operation allows. /// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize { pub fn maximum_typo(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Phrase}; use Operation::{And, Or, Phrase, Query};
match operation { match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(), And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
@ -498,13 +519,12 @@ pub fn maximum_typo(operation: &Operation) -> usize {
/// Returns the maximum proximity that this Operation allows. /// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize { pub fn maximum_proximity(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Phrase}; use Operation::{And, Or, Phrase, Query};
match operation { match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
And(ops) => { And(ops) => {
ops.iter().map(maximum_proximity).sum::<usize>() ops.iter().map(maximum_proximity).sum::<usize>() + ops.len().saturating_sub(1) * 7
+ ops.len().saturating_sub(1) * 7 }
},
Query(_) | Phrase(_) => 0, Query(_) | Phrase(_) => 0,
} }
} }
@ -515,7 +535,8 @@ mod test {
use maplit::hashmap; use maplit::hashmap;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use rand::{Rng, SeedableRng, rngs::StdRng}; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*; use super::*;
@ -532,11 +553,11 @@ mod test {
authorize_typos: bool, authorize_typos: bool,
words_limit: Option<usize>, words_limit: Option<usize>,
query: TokenStream, query: TokenStream,
) -> Result<Option<(Operation, PrimitiveQuery)>> ) -> Result<Option<(Operation, PrimitiveQuery)>> {
{
let primitive_query = create_primitive_query(query, None, words_limit); let primitive_query = create_primitive_query(query, None, words_limit);
if !primitive_query.is_empty() { if !primitive_query.is_empty() {
let qt = create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; let qt =
create_query_tree(self, optional_words, authorize_typos, &primitive_query)?;
Ok(Some((qt, primitive_query))) Ok(Some((qt, primitive_query)))
} else { } else {
Ok(None) Ok(None)
@ -620,15 +641,28 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "friends".to_string()) }), prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "friends".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), Operation::Query(Query {
]); prefix: true,
kind: QueryKind::tolerant(2, "heyfriends".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -640,15 +674,28 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friends".to_string()) }), prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friends".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), Operation::Query(Query {
]); prefix: false,
kind: QueryKind::tolerant(2, "heyfriends".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -660,26 +707,60 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Or(false, vec![ Operation::Or(
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hi".to_string()) }), false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hi".to_string()),
}),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("morning".to_string()) }), prefix: false,
kind: QueryKind::exact("good".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("morning".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "hello".to_string()) }), Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "hello".to_string()),
}),
],
),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("earth".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nature".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
],
),
]), ]),
Operation::Or(false, vec![ Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("earth".to_string()) }), prefix: false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("nature".to_string()) }), kind: QueryKind::tolerant(2, "helloworld".to_string()),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), }),
]), ],
]), );
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }),
]);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -691,40 +772,95 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), Operation::Query(Query {
Operation::Or(false, vec![ prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), prefix: false,
]), kind: QueryKind::exact("york".to_string()),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "yorkcity".to_string()) }), }),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]), ]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "yorkcity".to_string()),
}),
],
),
]), ]),
Operation::And(vec![ Operation::And(vec![
Operation::Or(false, vec![ Operation::Or(
Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nyc".to_string()),
}),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), prefix: false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), kind: QueryKind::exact("new".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "newyork".to_string()) }), Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "newyork".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("city".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), Operation::Or(
]), false,
Operation::Or(false, vec![ vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("nyc".to_string()),
}),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), prefix: false,
kind: QueryKind::exact("new".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("york".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "newyorkcity".to_string()) }), Operation::Query(Query {
]), prefix: false,
]); kind: QueryKind::tolerant(2, "newyorkcity".to_string()),
}),
],
),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -736,15 +872,28 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("n".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "grams".to_string()) }), prefix: false,
kind: QueryKind::exact("n".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "grams".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), Operation::Query(Query {
]); prefix: false,
kind: QueryKind::tolerant(1, "ngrams".to_string()),
}),
],
);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -756,21 +905,34 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Or(false, vec![ Operation::Or(
Operation::Phrase(vec![ false,
"word".to_string(), vec![
"split".to_string(), Operation::Phrase(vec!["word".to_string(), "split".to_string()]),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(2, "wordsplit".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("fish".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), Operation::Query(Query {
]), prefix: false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("fish".to_string()) }) kind: QueryKind::tolerant(2, "wordsplitfish".to_string()),
]), }),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), ],
]); );
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -783,14 +945,12 @@ mod test {
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::And(vec![ let expected = Operation::And(vec![
Operation::Phrase(vec![ Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
"hey".to_string(),
"friends".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
]); ]);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -803,17 +963,12 @@ mod test {
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::And(vec![ let expected = Operation::And(vec![
Operation::Phrase(vec![ Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
"hey".to_string(), Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]),
"friends".to_string(),
]),
Operation::Phrase(vec![
"wooop".to_string(),
"wooop".to_string(),
]),
]); ]);
let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -825,34 +980,80 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(true, vec![ let expected = Operation::Or(
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), true,
Operation::Or(false, vec![ vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), Operation::Query(Query {
]), prefix: false,
Operation::Or(false, vec![ kind: QueryKind::tolerant(1, "heymy".to_string()),
}),
],
),
Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Or(false, vec![ prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friend".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "myfriend".to_string()) }) Operation::Query(Query {
]) prefix: false,
kind: QueryKind::tolerant(1, "myfriend".to_string()),
}),
],
),
]), ]),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), prefix: false,
kind: QueryKind::tolerant(1, "heymy".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "friend".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), Operation::Query(Query {
]), prefix: false,
]); kind: QueryKind::tolerant(2, "heymyfriend".to_string()),
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); }),
],
),
],
);
let (query_tree, _) =
TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -864,11 +1065,9 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Phrase(vec![ let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]);
"hey".to_string(), let (query_tree, _) =
"my".to_string(), TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -880,29 +1079,66 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(true, vec![ let expected = Operation::Or(
true,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]), ]),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), prefix: false,
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]), ]),
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Or(false, vec![ prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), prefix: false,
kind: QueryKind::exact("my".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("good".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "mygood".to_string()) }), Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "mygood".to_string()),
}),
],
),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friend".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), ],
]), );
]); let (query_tree, _) =
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -914,14 +1150,27 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Or(false, vec![ let expected = Operation::Or(
false,
vec![
Operation::And(vec![ Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query {
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), prefix: false,
kind: QueryKind::exact("hey".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("friends".to_string()),
}),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), Operation::Query(Query {
]); prefix: false,
let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); kind: QueryKind::exact("heyfriends".to_string()),
}),
],
);
let (query_tree, _) =
TestContext::default().build(false, false, None, tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
@ -934,14 +1183,12 @@ mod test {
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::And(vec![ let expected = Operation::And(vec![
Operation::Phrase(vec![ Operation::Phrase(vec!["hey".to_string(), "my".to_string()]),
"hey".to_string(),
"my".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
]); ]);
let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); let (query_tree, _) =
TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap();
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }

View File

@ -1,6 +1,7 @@
use std::iter::{Chain, FromIterator}; use std::iter::{Chain, FromIterator};
use std::ops::RangeInclusive; use std::ops::RangeInclusive;
use roaring::bitmap::{RoaringBitmap, IntoIter};
use roaring::bitmap::{IntoIter, RoaringBitmap};
pub struct AvailableDocumentsIds { pub struct AvailableDocumentsIds {
iter: Chain<IntoIter, RangeInclusive<u32>>, iter: Chain<IntoIter, RangeInclusive<u32>>,
@ -18,16 +19,12 @@ impl AvailableDocumentsIds {
None => 1..=0, // empty range iterator None => 1..=0, // empty range iterator
}; };
AvailableDocumentsIds { AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
iter: available.into_iter().chain(iter),
} }
},
None => { None => {
let empty = RoaringBitmap::new().into_iter(); let empty = RoaringBitmap::new().into_iter();
AvailableDocumentsIds { AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) }
iter: empty.chain(0..=u32::max_value()),
} }
},
} }
} }
} }

View File

@ -1,7 +1,7 @@
use chrono::Utc; use chrono::Utc;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result}; use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result};
pub struct ClearDocuments<'t, 'u, 'i> { pub struct ClearDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
pub fn new( pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
update_id: u64 update_id: u64,
) -> ClearDocuments<'t, 'u, 'i> { ) -> ClearDocuments<'t, 'u, 'i> {
ClearDocuments { wtxn, index, _update_id: update_id } ClearDocuments { wtxn, index, _update_id: update_id }
} }
@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
mod tests { mod tests {
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*; use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
#[test] #[test]
fn clear_documents() { fn clear_documents() {

View File

@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
use std::collections::HashMap;
use chrono::Utc; use chrono::Utc;
use fst::IntoStreamer; use fst::IntoStreamer;
@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use crate::error::{InternalError, FieldIdMapMissingEntry, UserError}; use super::ClearDocuments;
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
use super::ClearDocuments;
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
update_id: u64, update_id: u64,
) -> Result<DeleteDocuments<'t, 'u, 'i>> ) -> Result<DeleteDocuments<'t, 'u, 'i>> {
{ let external_documents_ids = index.external_documents_ids(wtxn)?.into_static();
let external_documents_ids = index
.external_documents_ids(wtxn)?
.into_static();
Ok(DeleteDocuments { Ok(DeleteDocuments {
wtxn, wtxn,
@ -84,11 +81,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
key: Some(main_key::PRIMARY_KEY_KEY), key: Some(main_key::PRIMARY_KEY_KEY),
} }
})?; })?;
let id_field = fields_ids_map.id(primary_key).ok_or_else(|| { let id_field =
FieldIdMapMissingEntry::FieldName { fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: primary_key.to_string(), field_name: primary_key.to_string(),
process: "DeleteDocuments::execute", process: "DeleteDocuments::execute",
}
})?; })?;
let Index { let Index {
@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let external_id = match serde_json::from_slice(content).unwrap() { let external_id = match serde_json::from_slice(content).unwrap() {
Value::String(string) => SmallString32::from(string.as_str()), Value::String(string) => SmallString32::from(string.as_str()),
Value::Number(number) => SmallString32::from(number.to_string()), Value::Number(number) => SmallString32::from(number.to_string()),
document_id => return Err(UserError::InvalidDocumentId { document_id }.into()), document_id => {
return Err(UserError::InvalidDocumentId { document_id }.into())
}
}; };
external_ids.push(external_id); external_ids.push(external_id);
} }
@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
match entry.get().checked_sub(count_diff) { match entry.get().checked_sub(count_diff) {
Some(0) | None => entry.remove(), Some(0) | None => entry.remove(),
Some(count) => entry.insert(count) Some(count) => entry.insert(count),
}; };
} }
} }
@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
// We construct an FST set that contains the words to delete from the words FST. // We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = words.iter().filter_map(|(word, must_remove)| { let words_to_delete =
if *must_remove { Some(word.as_ref()) } else { None } words.iter().filter_map(
}); |(word, must_remove)| {
if *must_remove {
Some(word.as_ref())
} else {
None
}
},
);
let words_to_delete = fst::Set::from_iter(words_to_delete)?; let words_to_delete = fst::Set::from_iter(words_to_delete)?;
let new_words_fst = { let new_words_fst = {
@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// We delete the documents ids that are under the pairs of words, // We delete the documents ids that are under the pairs of words,
// it is faster and uses no memory to iterate over all the words pairs than // it is faster and uses no memory to iterate over all the words pairs than
// to compute the cartesian product of every word of the deleted documents. // to compute the cartesian product of every word of the deleted documents.
let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?; let mut iter =
word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, mut docids) = result?; let (bytes, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
// We delete the documents ids that are under the word level position docids. // We delete the documents ids that are under the word level position docids.
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); let mut iter =
word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, mut docids) = result?; let (bytes, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
// We delete the documents ids that are under the word prefix level position docids. // We delete the documents ids that are under the word prefix level position docids.
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); let mut iter =
word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, mut docids) = result?; let (bytes, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
@ -400,9 +408,8 @@ where
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
F: Fn(K) -> DocumentId, F: Fn(K) -> DocumentId,
{ {
let mut iter = db.remap_key_type::<ByteSlice>() let mut iter =
.prefix_iter_mut(wtxn, &[field_id])? db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
.remap_key_type::<C>();
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (key, ()) = result?; let (key, ()) = result?;
@ -441,8 +448,8 @@ where
mod tests { mod tests {
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*; use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
#[test] #[test]
fn delete_documents_with_numbers_as_primary_key() { fn delete_documents_with_numbers_as_primary_key() {

View File

@ -3,17 +3,18 @@ use std::fs::File;
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
use chrono::Utc; use chrono::Utc;
use grenad::{CompressionType, Reader, Writer, FileFuse}; use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore}; use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesEncode, Error}; use heed::{BytesEncode, Error};
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::InternalError; use crate::error::InternalError;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::update::index_documents::WriteMethod; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; use crate::update::index_documents::{
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
};
use crate::{Index, Result}; use crate::{Index, Result};
pub struct Facets<'t, 'u, 'i> { pub struct Facets<'t, 'u, 'i> {
@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
update_id: u64, update_id: u64,
) -> Facets<'t, 'u, 'i> ) -> Facets<'t, 'u, 'i> {
{
Facets { Facets {
wtxn, wtxn,
index, index,
@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
)?; )?;
// Clear the facet number levels. // Clear the facet number levels.
clear_field_number_levels( clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;
self.wtxn,
self.index.facet_id_f64_docids,
field_id,
)?;
// Compute and store the faceted numbers documents ids. // Compute and store the faceted numbers documents ids.
let number_documents_ids = compute_faceted_documents_ids( let number_documents_ids = compute_faceted_documents_ids(
@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
field_id, field_id,
)?; )?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; self.index.put_string_faceted_documents_ids(
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; self.wtxn,
field_id,
&string_documents_ids,
)?;
self.index.put_number_faceted_documents_ids(
self.wtxn,
field_id,
&number_documents_ids,
)?;
write_into_lmdb_database( write_into_lmdb_database(
self.wtxn, self.wtxn,
@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
} }
} }
fn clear_field_number_levels<'t, >( fn clear_field_number_levels<'t>(
wtxn: &'t mut heed::RwTxn, wtxn: &'t mut heed::RwTxn,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
field_id: u8, field_id: u8,
) -> heed::Result<()> ) -> heed::Result<()> {
{
let left = (field_id, 1, f64::MIN, f64::MIN); let left = (field_id, 1, f64::MIN, f64::MIN);
let right = (field_id, u8::MAX, f64::MAX, f64::MAX); let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
let range = left..=right; let range = left..=right;
@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>(
level_group_size: NonZeroUsize, level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize, min_level_size: NonZeroUsize,
field_id: u8, field_id: u8,
) -> Result<Reader<FileFuse>> ) -> Result<Reader<FileFuse>> {
{
let first_level_size = db let first_level_size = db
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &[field_id])? .prefix_iter(rtxn, &[field_id])?
@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>(
// It is forbidden to keep a cursor and write in a database at the same time with LMDB // It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transferring them. // therefore we write the facet levels entries into a grenad file before transferring them.
let mut writer = tempfile::tempfile().and_then(|file| { let mut writer = tempfile::tempfile()
create_writer(compression_type, compression_level, file) .and_then(|file| create_writer(compression_type, compression_level, file))?;
})?;
let level_0_range = { let level_0_range = {
let left = (field_id, 0, f64::MIN, f64::MIN); let left = (field_id, 0, f64::MIN, f64::MIN);
@ -196,8 +197,7 @@ fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8, field_id: u8,
) -> Result<RoaringBitmap> ) -> Result<RoaringBitmap> {
{
let mut documents_ids = RoaringBitmap::new(); let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? { for result in db.prefix_iter(rtxn, &[field_id])? {
@ -215,8 +215,7 @@ fn write_number_entry(
left: f64, left: f64,
right: f64, right: f64,
ids: &RoaringBitmap, ids: &RoaringBitmap,
) -> Result<()> ) -> Result<()> {
{
let key = (field_id, level, left, right); let key = (field_id, level, left, right);
let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;

View File

@ -1,7 +1,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashSet; use std::collections::HashSet;
use std::fs::File; use std::fs::File;
use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
use std::num::{NonZeroU32, NonZeroUsize}; use std::num::{NonZeroU32, NonZeroUsize};
use std::result::Result as StdResult; use std::result::Result as StdResult;
use std::str; use std::str;
@ -10,28 +10,26 @@ use std::time::Instant;
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use chrono::Utc; use chrono::Utc;
use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
use heed::types::ByteSlice; use heed::types::ByteSlice;
use log::{debug, info, error}; use log::{debug, error, info};
use memmap::Mmap; use memmap::Mmap;
use rayon::prelude::*; use rayon::prelude::*;
use rayon::ThreadPool; use rayon::ThreadPool;
use serde::{Serialize, Deserialize}; use serde::{Deserialize, Serialize};
use crate::error::{Error, InternalError};
use crate::{Index, Result};
use crate::update::{
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
WordPrefixPairProximityDocids,
};
use self::store::{Store, Readers};
pub use self::merge_function::{ pub use self::merge_function::{
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
}; };
use self::store::{Readers, Store};
pub use self::transform::{Transform, TransformOutput}; pub use self::transform::{Transform, TransformOutput};
use crate::MergeFn;
use super::UpdateBuilder; use super::UpdateBuilder;
use crate::error::{Error, InternalError};
use crate::update::{
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
};
use crate::{Index, MergeFn, Result};
mod merge_function; mod merge_function;
mod store; mod store;
@ -48,7 +46,11 @@ pub enum WriteMethod {
GetMergePut, GetMergePut,
} }
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { pub fn create_writer(
typ: CompressionType,
level: Option<u32>,
file: File,
) -> io::Result<Writer<File>> {
let mut builder = Writer::builder(); let mut builder = Writer::builder();
builder.compression_type(typ); builder.compression_type(typ);
if let Some(level) = level { if let Some(level) = level {
@ -64,8 +66,7 @@ pub fn create_sorter<E>(
chunk_fusing_shrink_size: Option<u64>, chunk_fusing_shrink_size: Option<u64>,
max_nb_chunks: Option<usize>, max_nb_chunks: Option<usize>,
max_memory: Option<usize>, max_memory: Option<usize>,
) -> Sorter<MergeFn<E>> ) -> Sorter<MergeFn<E>> {
{
let mut builder = Sorter::builder(merge); let mut builder = Sorter::builder(merge);
if let Some(shrink_size) = chunk_fusing_shrink_size { if let Some(shrink_size) = chunk_fusing_shrink_size {
builder.file_fusing_shrink_size(shrink_size); builder.file_fusing_shrink_size(shrink_size);
@ -83,7 +84,10 @@ pub fn create_sorter<E>(
builder.build() builder.build()
} }
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> { pub fn writer_into_reader(
writer: Writer<File>,
shrink_size: Option<u64>,
) -> Result<Reader<FileFuse>> {
let mut file = writer.into_inner()?; let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?; file.seek(SeekFrom::Start(0))?;
let file = if let Some(shrink_size) = shrink_size { let file = if let Some(shrink_size) = shrink_size {
@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res
pub fn merge_readers<E>( pub fn merge_readers<E>(
sources: Vec<Reader<FileFuse>>, sources: Vec<Reader<FileFuse>>,
merge: MergeFn<E>, merge: MergeFn<E>,
) -> Merger<FileFuse, MergeFn<E>> ) -> Merger<FileFuse, MergeFn<E>> {
{
let mut builder = Merger::builder(merge); let mut builder = Merger::builder(merge);
builder.extend(sources); builder.extend(sources);
builder.build() builder.build()
@ -118,13 +121,7 @@ where
let before = Instant::now(); let before = Instant::now();
let merger = merge_readers(sources, merge); let merger = merge_readers(sources, merge);
merger_iter_into_lmdb_database( merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
wtxn,
database,
merger.into_merge_iter()?,
merge,
method,
)?;
debug!("MTBL stores merged in {:.02?}!", before.elapsed()); debug!("MTBL stores merged in {:.02?}!", before.elapsed());
Ok(()) Ok(())
@ -149,7 +146,7 @@ where
while let Some((k, v)) = reader.next()? { while let Some((k, v)) = reader.next()? {
out_iter.append(k, v)?; out_iter.append(k, v)?;
} }
}, }
WriteMethod::GetMergePut => { WriteMethod::GetMergePut => {
while let Some((k, v)) = reader.next()? { while let Some((k, v)) = reader.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -158,11 +155,11 @@ where
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
let val = merge(k, &vals)?; let val = merge(k, &vals)?;
iter.put_current(k, &val)?; iter.put_current(k, &val)?;
}, }
_ => { _ => {
drop(iter); drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
}, }
} }
} }
} }
@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>(
) -> Result<()> ) -> Result<()>
where where
Error: From<E>, Error: From<E>,
Error: From<grenad::Error<E>> Error: From<grenad::Error<E>>,
{ {
debug!("Writing MTBL sorter..."); debug!("Writing MTBL sorter...");
let before = Instant::now(); let before = Instant::now();
merger_iter_into_lmdb_database( merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
wtxn,
database,
sorter.into_iter()?,
merge,
method,
)?;
debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
Ok(()) Ok(())
@ -214,7 +205,7 @@ where
while let Some((k, v)) = sorter.next()? { while let Some((k, v)) = sorter.next()? {
out_iter.append(k, v)?; out_iter.append(k, v)?;
} }
}, }
WriteMethod::GetMergePut => { WriteMethod::GetMergePut => {
while let Some((k, v)) = sorter.next()? { while let Some((k, v)) = sorter.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -226,14 +217,14 @@ where
InternalError::IndexingMergingKeys { process: "get-put-merge" } InternalError::IndexingMergingKeys { process: "get-put-merge" }
})?; })?;
iter.put_current(k, &val)?; iter.put_current(k, &val)?;
}, }
_ => { _ => {
drop(iter); drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
} }
} }
}, }
}
} }
Ok(()) Ok(())
@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// Early return when there is no document to add // Early return when there is no document to add
if reader.buffer().is_empty() { if reader.buffer().is_empty() {
return Ok(DocumentAdditionResult { return Ok(DocumentAdditionResult { nb_documents: 0 });
nb_documents: 0,
})
} }
self.index.set_updated_at(self.wtxn, &Utc::now())?; self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let output = match self.update_format { let output = match self.update_format {
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?, UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?, UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?, UpdateFormat::JsonStream => {
transform.output_from_json_stream(reader, &progress_callback)?
}
}; };
let nb_documents = output.documents_count; let nb_documents = output.documents_count;
@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()> pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
where where
F: Fn(UpdateIndexingStep) + Sync F: Fn(UpdateIndexingStep) + Sync,
{ {
let before_indexing = Instant::now(); let before_indexing = Instant::now();
@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// settings if none have already been set. // settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?; backup_pool = rayon::ThreadPoolBuilder::new().build()?;
&backup_pool &backup_pool
}, }
}; };
let readers = pool.install(|| { let readers = pool.install(|| {
@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut documents_ids = self.index.documents_ids(self.wtxn)?; let mut documents_ids = self.index.documents_ids(self.wtxn)?;
let contains_documents = !documents_ids.is_empty(); let contains_documents = !documents_ids.is_empty();
let write_method = if contains_documents { let write_method =
WriteMethod::GetMergePut if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
} else {
WriteMethod::Append
};
debug!("Writing using the write method: {:?}", write_method); debug!("Writing using the write method: {:?}", write_method);
@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.docid_word_positions.as_polymorph(), *self.index.docid_word_positions.as_polymorph(),
docid_word_positions_readers, docid_word_positions_readers,
keep_first, keep_first,
write_method write_method,
)?; )?;
database_count += 1; database_count += 1;
@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.documents.as_polymorph(), *self.index.documents.as_polymorph(),
documents_readers, documents_readers,
keep_first, keep_first,
write_method write_method,
)?; )?;
database_count += 1; database_count += 1;
@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
fst_merge, fst_merge,
WriteMethod::GetMergePut, WriteMethod::GetMergePut,
)?; )?;
}, }
DatabaseType::WordDocids => { DatabaseType::WordDocids => {
debug!("Writing the words docids into LMDB on disk..."); debug!("Writing the words docids into LMDB on disk...");
let db = *self.index.word_docids.as_polymorph(); let db = *self.index.word_docids.as_polymorph();
@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
roaring_bitmap_merge, roaring_bitmap_merge,
write_method, write_method,
)?; )?;
}, }
DatabaseType::FacetLevel0NumbersDocids => { DatabaseType::FacetLevel0NumbersDocids => {
debug!("Writing the facet numbers docids into LMDB on disk..."); debug!("Writing the facet numbers docids into LMDB on disk...");
let db = *self.index.facet_id_f64_docids.as_polymorph(); let db = *self.index.facet_id_f64_docids.as_polymorph();
@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge, cbo_roaring_bitmap_merge,
write_method, write_method,
)?; )?;
}, }
DatabaseType::FieldIdWordCountDocids => { DatabaseType::FieldIdWordCountDocids => {
debug!("Writing the field id word count docids into LMDB on disk..."); debug!("Writing the field id word count docids into LMDB on disk...");
let db = *self.index.field_id_word_count_docids.as_polymorph(); let db = *self.index.field_id_word_count_docids.as_polymorph();
@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge, cbo_roaring_bitmap_merge,
write_method, write_method,
)?; )?;
}, }
DatabaseType::WordLevel0PositionDocids => { DatabaseType::WordLevel0PositionDocids => {
debug!("Writing the word level 0 positions docids into LMDB on disk..."); debug!("Writing the word level 0 positions docids into LMDB on disk...");
let db = *self.index.word_level_position_docids.as_polymorph(); let db = *self.index.word_level_position_docids.as_polymorph();
@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use super::*;
#[test] #[test]
fn simple_document_replacement() { fn simple_document_replacement() {
let path = tempfile::tempdir().unwrap(); let path = tempfile::tempdir().unwrap();
@ -1053,9 +1042,8 @@ mod tests {
assert_eq!(count, 3); assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
let (kevin_id, _) = docs.iter().find(|(_, d)| { let (kevin_id, _) =
d.get(0).unwrap() == br#""updated kevin""# docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
}).unwrap();
let (id, doc) = docs[*kevin_id as usize]; let (id, doc) = docs[*kevin_id as usize];
assert_eq!(id, *kevin_id); assert_eq!(id, *kevin_id);

View File

@ -8,25 +8,29 @@ use std::{cmp, iter};
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use fst::Set; use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
use heed::BytesEncode; use heed::BytesEncode;
use linked_hash_map::LinkedHashMap; use linked_hash_map::LinkedHashMap;
use log::{debug, info}; use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use meilisearch_tokenizer::token::SeparatorKind;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use tempfile::tempfile; use tempfile::tempfile;
use super::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
use crate::error::{Error, InternalError, SerializationError}; use crate::error::{Error, InternalError, SerializationError};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep; use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result}; use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
const LMDB_MAX_KEY_LENGTH: usize = 511; const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_KILOBYTE: usize = 1024 * 1024; const ONE_KILOBYTE: usize = 1024 * 1024;
@ -56,7 +60,8 @@ pub struct Store<'s, A> {
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize, word_docids_limit: usize,
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, words_pairs_proximities_docids:
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize, words_pairs_proximities_docids_limit: usize,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
chunk_compression_level: Option<u32>, chunk_compression_level: Option<u32>,
chunk_fusing_shrink_size: Option<u64>, chunk_fusing_shrink_size: Option<u64>,
stop_words: Option<&'s Set<A>>, stop_words: Option<&'s Set<A>>,
) -> Result<Self> ) -> Result<Self> {
{
// We divide the max memory by the number of sorters the Store has. // We divide the max memory by the number of sorters the Store has.
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Some(1024 * 1024 * 1024), // 1GiB Some(1024 * 1024 * 1024), // 1GiB
); );
let documents_writer = tempfile().and_then(|f| { let documents_writer = tempfile()
create_writer(chunk_compression_type, chunk_compression_level, f) .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
})?; let docid_word_positions_writer = tempfile()
let docid_word_positions_writer = tempfile().and_then(|f| { .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
if let Some(stop_words) = stop_words { if let Some(stop_words) = stop_words {
@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
// if get_refresh finds the element it is assured to be at the end of the linked hash map. // if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.word_docids.get_refresh(word.as_bytes()) { match self.word_docids.get_refresh(word.as_bytes()) {
Some(old) => { old.insert(id); }, Some(old) => {
old.insert(id);
}
None => { None => {
let word_vec = SmallVec32::from(word.as_bytes()); let word_vec = SmallVec32::from(word.as_bytes());
// A newly inserted element is appended at the end of the linked hash map. // A newly inserted element is appended at the end of the linked hash map.
@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId, field_id: FieldId,
value: OrderedFloat<f64>, value: OrderedFloat<f64>,
id: DocumentId, id: DocumentId,
) -> Result<()> ) -> Result<()> {
{
let sorter = &mut self.field_id_docid_facet_numbers_sorter; let sorter = &mut self.field_id_docid_facet_numbers_sorter;
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
let key = (field_id, value); let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map. // if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_number_docids.get_refresh(&key) { match self.facet_field_number_docids.get_refresh(&key) {
Some(old) => { old.insert(id); }, Some(old) => {
old.insert(id);
}
None => { None => {
// A newly inserted element is appended at the end of the linked hash map. // A newly inserted element is appended at the end of the linked hash map.
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId, field_id: FieldId,
value: String, value: String,
id: DocumentId, id: DocumentId,
) -> Result<()> ) -> Result<()> {
{
let sorter = &mut self.field_id_docid_facet_strings_sorter; let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
let key = (field_id, value); let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map. // if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) { match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { old.insert(id); }, Some(old) => {
old.insert(id);
}
None => { None => {
// A newly inserted element is appended at the end of the linked hash map. // A newly inserted element is appended at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -311,8 +317,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
&mut self, &mut self,
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>, words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
id: DocumentId, id: DocumentId,
) -> Result<()> ) -> Result<()> {
{
for ((w1, w2), prox) in words_pairs_proximities { for ((w1, w2), prox) in words_pairs_proximities {
let w1 = SmallVec32::from(w1.as_bytes()); let w1 = SmallVec32::from(w1.as_bytes());
let w2 = SmallVec32::from(w2.as_bytes()); let w2 = SmallVec32::from(w2.as_bytes());
@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// if get_refresh finds the element it is assured // if get_refresh finds the element it is assured
// to be at the end of the linked hash map. // to be at the end of the linked hash map.
match self.words_pairs_proximities_docids.get_refresh(&key) { match self.words_pairs_proximities_docids.get_refresh(&key) {
Some(old) => { old.insert(id); }, Some(old) => {
old.insert(id);
}
None => { None => {
// A newly inserted element is append at the end of the linked hash map. // A newly inserted element is append at the end of the linked hash map.
let ids = RoaringBitmap::from_iter(Some(id)); let ids = RoaringBitmap::from_iter(Some(id));
@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Removing front elements is equivalent to removing the LRUs. // Removing front elements is equivalent to removing the LRUs.
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
iter.take(overflow).for_each(|x| lrus.push(x)); iter.take(overflow).for_each(|x| lrus.push(x));
Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?; Self::write_words_pairs_proximities(
&mut self.words_pairs_proximities_docids_sorter,
lrus,
)?;
} }
Ok(()) Ok(())
@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>, facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
record: &[u8], record: &[u8],
) -> Result<()> ) -> Result<()> {
{
// We compute the list of words pairs proximities (self-join) and write it directly to disk. // We compute the list of words pairs proximities (self-join) and write it directly to disk.
let words_pair_proximities = compute_words_pair_proximities(&words_positions); let words_pair_proximities = compute_words_pair_proximities(&words_positions);
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
} }
self.documents_writer.insert(document_id.to_be_bytes(), record)?; self.documents_writer.insert(document_id.to_be_bytes(), record)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; Self::write_docid_word_positions(
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; &mut self.docid_word_positions_writer,
document_id,
words_positions,
)?;
Self::write_word_position_docids(
&mut self.word_level_position_docids_sorter,
document_id,
words_positions,
)?;
words_positions.clear(); words_positions.clear();
@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
writer: &mut Writer<File>, writer: &mut Writer<File>,
id: DocumentId, id: DocumentId,
words_positions: &HashMap<String, SmallVec32<Position>>, words_positions: &HashMap<String, SmallVec32<Position>>,
) -> Result<()> ) -> Result<()> {
{
// We prefix the words by the document id. // We prefix the words by the document id.
let mut key = id.to_be_bytes().to_vec(); let mut key = id.to_be_bytes().to_vec();
let mut buffer = Vec::new(); let mut buffer = Vec::new();
@ -484,10 +500,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(()) Ok(())
} }
fn write_facet_field_string_docids<I, E>( fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
where where
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>, I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
Error: From<E>, Error: From<E>,
@ -510,10 +523,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(()) Ok(())
} }
fn write_facet_field_number_docids<I, E>( fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
where where
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>, I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
Error: From<E>, Error: From<E>,
@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
log_every_n: Option<usize>, log_every_n: Option<usize>,
mut progress_callback: F, mut progress_callback: F,
) -> Result<Readers> ) -> Result<Readers>
where F: FnMut(UpdateIndexingStep), where
F: FnMut(UpdateIndexingStep),
{ {
debug!("{:?}: Indexing in a Store...", thread_index); debug!("{:?}: Indexing in a Store...", thread_index);
@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
if count % num_threads == thread_index { if count % num_threads == thread_index {
// This is a log routine that we do every `log_every_n` documents. // This is a log routine that we do every `log_every_n` documents.
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); info!(
"We have seen {} documents so far ({:.02?}).",
format_count(count),
before.elapsed()
);
progress_callback(UpdateIndexingStep::IndexDocuments { progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count, documents_seen: count,
total_documents: documents_count, total_documents: documents_count,
@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
} }
for (attr, content) in document.iter() { for (attr, content) in document.iter() {
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr)
let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; {
let value =
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
let (facet_numbers, facet_strings) = extract_facet_values(&value); let (facet_numbers, facet_strings) = extract_facet_values(&value);
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); facet_numbers_values
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); .entry(attr)
.or_insert_with(Vec::new)
.extend(facet_numbers);
facet_strings_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_strings);
if self.searchable_fields.contains(&attr) { if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) { let content = match json_to_string(&value) {
@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
last_pos = Some(pos); last_pos = Some(pos);
let position = (attr as usize * MAX_POSITION + pos) as u32; let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); words_positions
.entry(token.text().to_string())
.or_insert_with(SmallVec32::new)
.push(position);
} }
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
let key = (attr, last_pos as u8 + 1); let key = (attr, last_pos as u8 + 1);
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id); self.field_id_word_count_docids
.entry(key)
.or_insert_with(RoaringBitmap::new)
.insert(document_id);
} }
} }
} }
@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
self.facet_field_string_docids, self.facet_field_string_docids,
)?; )?;
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut word_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut builder = fst::SetBuilder::memory(); let mut builder = fst::SetBuilder::memory();
let mut iter = self.word_docids_sorter.into_iter()?; let mut iter = self.word_docids_sorter.into_iter()?;
@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.main_sorter.write_into(&mut main_wtr)?; self.main_sorter.write_into(&mut main_wtr)?;
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut words_pairs_proximities_docids_wtr =
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter
.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut word_level_position_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut field_id_word_count_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut facet_field_numbers_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut facet_field_strings_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut field_id_docid_facet_numbers_wtr =
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut field_id_docid_facet_strings_wtr =
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter
.write_into(&mut field_id_docid_facet_strings_wtr)?;
let main = writer_into_reader(main_wtr, shrink_size)?; let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let words_pairs_proximities_docids =
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; let word_level_position_docids =
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; let field_id_word_count_docids =
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; let facet_field_numbers_docids =
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids =
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers =
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings =
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions =
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let documents = writer_into_reader(self.documents_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?;
Ok(Readers { Ok(Readers {
@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
/// close to each other. /// close to each other.
fn compute_words_pair_proximities( fn compute_words_pair_proximities(
word_positions: &HashMap<String, SmallVec32<Position>>, word_positions: &HashMap<String, SmallVec32<Position>>,
) -> HashMap<(&str, &str), u8> ) -> HashMap<(&str, &str), u8> {
{
use itertools::Itertools; use itertools::Itertools;
let mut words_pair_proximities = HashMap::new(); let mut words_pair_proximities = HashMap::new();
@ -828,7 +875,9 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool {
/// take an iterator on tokens and compute their relative position depending on separator kinds /// take an iterator on tokens and compute their relative position depending on separator kinds
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, /// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standart proximity of 1 between words. /// else we keep the standart proximity of 1 between words.
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> { fn process_tokens<'a>(
tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens tokens
.skip_while(|token| token.is_separator().is_some()) .skip_while(|token| token.is_separator().is_some())
.scan((0, None), |(offset, prev_kind), token| { .scan((0, None), |(offset, prev_kind), token| {
@ -845,7 +894,8 @@ fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<
*prev_kind = Some(token.kind); *prev_kind = Some(token.kind);
} }
TokenKind::Separator(SeparatorKind::Soft) TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
{
*prev_kind = Some(token.kind); *prev_kind = Some(token.kind);
} }
_ => (), _ => (),
@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
match value { match value {
Value::Null => (), Value::Null => (),
Value::Bool(b) => output_strings.push(b.to_string()), Value::Bool(b) => output_strings.push(b.to_string()),
Value::Number(number) => if let Some(float) = number.as_f64() { Value::Number(number) => {
if let Some(float) = number.as_f64() {
output_numbers.push(float); output_numbers.push(float);
}, }
}
Value::String(string) => { Value::String(string) => {
let string = string.trim().to_lowercase(); let string = string.trim().to_lowercase();
output_strings.push(string); output_strings.push(string);
}, }
Value::Array(values) => if can_recurse { Value::Array(values) => {
if can_recurse {
for value in values { for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings); inner_extract_facet_values(value, false, output_numbers, output_strings);
} }
}, }
}
Value::Object(_) => (), Value::Object(_) => (),
} }
} }

View File

@ -10,14 +10,15 @@ use log::info;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use crate::error::{Error, UserError, InternalError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
use crate::{Index, Result};
use super::merge_function::merge_two_obkvs; use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod}; use super::{create_sorter, create_writer, IndexDocumentsMethod};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32,
};
const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
@ -64,7 +65,11 @@ impl Transform<'_, '_> {
self.output_from_generic_json(reader, false, progress_callback) self.output_from_generic_json(reader, false, progress_callback)
} }
pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput> pub fn output_from_json_stream<R, F>(
self,
reader: R,
progress_callback: F,
) -> Result<TransformOutput>
where where
R: Read, R: Read,
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
@ -86,7 +91,9 @@ impl Transform<'_, '_> {
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
// Deserialize the whole batch of documents in memory. // Deserialize the whole batch of documents in memory.
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream { let mut documents: Peekable<
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
> = if is_stream {
let iter = serde_json::Deserializer::from_reader(reader).into_iter(); let iter = serde_json::Deserializer::from_reader(reader).into_iter();
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>; let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable() iter.peekable()
@ -104,15 +111,16 @@ impl Transform<'_, '_> {
Err(_) => { Err(_) => {
let error = documents.next().unwrap().unwrap_err(); let error = documents.next().unwrap().unwrap_err();
return Err(UserError::SerdeJson(error).into()); return Err(UserError::SerdeJson(error).into());
}, }
}; };
let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); let alternative_name =
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let (primary_key_id, primary_key) = compute_primary_key_pair( let (primary_key_id, primary_key) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?, self.index.primary_key(self.rtxn)?,
&mut fields_ids_map, &mut fields_ids_map,
alternative_name, alternative_name,
self.autogenerate_docids self.autogenerate_docids,
)?; )?;
if documents.peek().is_none() { if documents.peek().is_none() {
@ -173,9 +181,11 @@ impl Transform<'_, '_> {
Some(value) => match value { Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()), Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()), Value::Number(number) => Cow::Owned(number.to_string()),
content => return Err(UserError::InvalidDocumentId { content => {
document_id: content.clone(), return Err(
}.into()), UserError::InvalidDocumentId { document_id: content.clone() }.into()
)
}
}, },
None => { None => {
if !self.autogenerate_docids { if !self.autogenerate_docids {
@ -183,7 +193,7 @@ impl Transform<'_, '_> {
} }
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
Cow::Borrowed(uuid) Cow::Borrowed(uuid)
}, }
}; };
// We iterate in the fields ids ordered. // We iterate in the fields ids ordered.
@ -194,7 +204,8 @@ impl Transform<'_, '_> {
// and this should be the document id we return the one we generated. // and this should be the document id we return the one we generated.
if let Some(value) = document.get(name) { if let Some(value) = document.get(name) {
// We serialize the attribute values. // We serialize the attribute values.
serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?; serde_json::to_writer(&mut json_buffer, value)
.map_err(InternalError::SerdeJson)?;
writer.insert(field_id, &json_buffer)?; writer.insert(field_id, &json_buffer)?;
} }
@ -202,7 +213,8 @@ impl Transform<'_, '_> {
if field_id == primary_key_id && validate_document_id(&external_id).is_none() { if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
return Err(UserError::InvalidDocumentId { return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id), document_id: Value::from(external_id),
}.into()); }
.into());
} }
} }
@ -250,7 +262,7 @@ impl Transform<'_, '_> {
Some(primary_key) => { Some(primary_key) => {
// The primary key is known so we must find the position in the CSV headers. // The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key) headers.iter().position(|h| h == primary_key)
}, }
None => headers.iter().position(is_primary_key), None => headers.iter().position(is_primary_key),
}; };
@ -261,7 +273,7 @@ impl Transform<'_, '_> {
self.index.primary_key(self.rtxn)?, self.index.primary_key(self.rtxn)?,
&mut fields_ids_map, &mut fields_ids_map,
alternative_name, alternative_name,
self.autogenerate_docids self.autogenerate_docids,
)?; )?;
// The primary key field is not present in the header, so we need to create it. // The primary key field is not present in the header, so we need to create it.
@ -308,18 +320,20 @@ impl Transform<'_, '_> {
// We validate the document id [a-zA-Z0-9\-_]. // We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&external_id) { match validate_document_id(&external_id) {
Some(valid) => valid, Some(valid) => valid,
None => return Err(UserError::InvalidDocumentId { None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id), document_id: Value::from(external_id),
}.into()),
} }
}, .into())
}
}
}
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
}; };
// When the primary_key_field_id is found in the fields ids list // When the primary_key_field_id is found in the fields ids list
// we return the generated document id instead of the record field. // we return the generated document id instead of the record field.
let iter = fields_ids.iter() let iter = fields_ids.iter().map(|(fi, i)| {
.map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] }; let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field) (fi, field)
}); });
@ -328,7 +342,8 @@ impl Transform<'_, '_> {
for (field_id, field) in iter { for (field_id, field) in iter {
// We serialize the attribute values as JSON strings. // We serialize the attribute values as JSON strings.
json_buffer.clear(); json_buffer.clear();
serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?; serde_json::to_writer(&mut json_buffer, &field)
.map_err(InternalError::SerdeJson)?;
writer.insert(*field_id, &json_buffer)?; writer.insert(*field_id, &json_buffer)?;
} }
@ -410,26 +425,27 @@ impl Transform<'_, '_> {
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
IndexDocumentsMethod::UpdateDocuments => { IndexDocumentsMethod::UpdateDocuments => {
let key = BEU32::new(docid); let key = BEU32::new(docid);
let base_obkv = self.index.documents.get(&self.rtxn, &key)? let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
.ok_or(InternalError::DatabaseMissingEntry { InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS, db_name: db_name::DOCUMENTS,
key: None, key: None,
})?; },
)?;
let update_obkv = obkv::KvReader::new(update_obkv); let update_obkv = obkv::KvReader::new(update_obkv);
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
(docid, obkv_buffer.as_slice()) (docid, obkv_buffer.as_slice())
} }
} }
}, }
None => { None => {
// If this user id is new we add it to the external documents ids map // If this user id is new we add it to the external documents ids map
// for new ids and into the list of new documents. // for new ids and into the list of new documents.
let new_docid = available_documents_ids.next() let new_docid =
.ok_or(UserError::DocumentLimitReached)?; available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
new_documents_ids.insert(new_docid); new_documents_ids.insert(new_docid);
(new_docid, update_obkv) (new_docid, update_obkv)
}, }
}; };
// We insert the document under the documents ids map into the final file. // We insert the document under the documents ids map into the final file.
@ -450,7 +466,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter. // We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?; let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
// Once we have written all the documents into the final sorter, we write the documents // Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again. // into this writer, extract the file and reset the seek to be able to read it again.
@ -485,8 +502,7 @@ impl Transform<'_, '_> {
primary_key: String, primary_key: String,
old_fields_ids_map: FieldsIdsMap, old_fields_ids_map: FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput> ) -> Result<TransformOutput> {
{
let fields_distribution = self.index.fields_distribution(self.rtxn)?; let fields_distribution = self.index.fields_distribution(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?;
@ -494,7 +510,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter. // We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?; let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut obkv_buffer = Vec::new(); let mut obkv_buffer = Vec::new();
for result in self.index.documents.iter(self.rtxn)? { for result in self.index.documents.iter(self.rtxn)? {
@ -561,20 +578,19 @@ fn compute_primary_key_pair(
return Err(UserError::MissingPrimaryKey.into()); return Err(UserError::MissingPrimaryKey.into());
} }
DEFAULT_PRIMARY_KEY_NAME.to_string() DEFAULT_PRIMARY_KEY_NAME.to_string()
}, }
}; };
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
Ok((id, name)) Ok((id, name))
}, }
} }
} }
fn validate_document_id(document_id: &str) -> Option<&str> { fn validate_document_id(document_id: &str) -> Option<&str> {
let document_id = document_id.trim(); let document_id = document_id.trim();
Some(document_id).filter(|id| { Some(document_id).filter(|id| {
!id.is_empty() && id.chars().all(|c| { !id.is_empty()
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_') && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
})
}) })
} }
@ -583,8 +599,7 @@ mod test {
use super::*; use super::*;
mod compute_primary_key { mod compute_primary_key {
use super::compute_primary_key_pair; use super::{compute_primary_key_pair, FieldsIdsMap};
use super::FieldsIdsMap;
#[test] #[test]
fn should_return_primary_key_if_is_some() { fn should_return_primary_key_if_is_some() {
@ -594,7 +609,8 @@ mod test {
Some("toto"), Some("toto"),
&mut fields_map, &mut fields_map,
Some("tata".to_string()), Some("tata".to_string()),
false); false,
);
assert_eq!(result.unwrap(), (0u8, "toto".to_string())); assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
assert_eq!(fields_map.len(), 1); assert_eq!(fields_map.len(), 1);
} }
@ -602,11 +618,8 @@ mod test {
#[test] #[test]
fn should_return_alternative_if_primary_is_none() { fn should_return_alternative_if_primary_is_none() {
let mut fields_map = FieldsIdsMap::new(); let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair( let result =
None, compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
&mut fields_map,
Some("tata".to_string()),
false);
assert_eq!(result.unwrap(), (0u8, "tata".to_string())); assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
assert_eq!(fields_map.len(), 1); assert_eq!(fields_map.len(), 1);
} }
@ -614,11 +627,7 @@ mod test {
#[test] #[test]
fn should_return_default_if_both_are_none() { fn should_return_default_if_both_are_none() {
let mut fields_map = FieldsIdsMap::new(); let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair( let result = compute_primary_key_pair(None, &mut fields_map, None, true);
None,
&mut fields_map,
None,
true);
assert_eq!(result.unwrap(), (0u8, "id".to_string())); assert_eq!(result.unwrap(), (0u8, "id".to_string()));
assert_eq!(fields_map.len(), 1); assert_eq!(fields_map.len(), 1);
} }
@ -626,11 +635,7 @@ mod test {
#[test] #[test]
fn should_return_err_if_both_are_none_and_recompute_is_false() { fn should_return_err_if_both_are_none_and_recompute_is_false() {
let mut fields_map = FieldsIdsMap::new(); let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair( let result = compute_primary_key_pair(None, &mut fields_map, None, false);
None,
&mut fields_map,
None,
false);
assert!(result.is_err()); assert!(result.is_err());
assert_eq!(fields_map.len(), 0); assert_eq!(fields_map.len(), 0);
} }

View File

@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments; pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments; pub use self::delete_documents::DeleteDocuments;
pub use self::facets::Facets; pub use self::facets::Facets;
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat}; pub use self::index_documents::{
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
};
pub use self::settings::{Setting, Settings}; pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder; pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep; pub use self::update_step::UpdateIndexingStep;

View File

@ -34,17 +34,24 @@ impl<T> Setting<T> {
} }
impl<T: Serialize> Serialize for Setting<T> { impl<T: Serialize> Serialize for Setting<T> {
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error> where S: Serializer { fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error>
where
S: Serializer,
{
match self { match self {
Self::Set(value) => Some(value), Self::Set(value) => Some(value),
// Usually not_set isn't serialized by setting skip_serializing_if field attribute // Usually not_set isn't serialized by setting skip_serializing_if field attribute
Self::NotSet | Self::Reset => None, Self::NotSet | Self::Reset => None,
}.serialize(serializer) }
.serialize(serializer)
} }
} }
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> { impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error> where D: Deserializer<'de> { fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error>
where
D: Deserializer<'de>,
{
Deserialize::deserialize(deserializer).map(|x| match x { Deserialize::deserialize(deserializer).map(|x| match x {
Some(x) => Self::Set(x), Some(x) => Self::Set(x),
None => Self::Reset, // Reset is forced by sending null value None => Self::Reset, // Reset is forced by sending null value
@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} }
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) { pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
self.stop_words = if stop_words.is_empty() { self.stop_words =
Setting::Reset if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
} else {
Setting::Set(stop_words)
}
} }
pub fn reset_distinct_field(&mut self) { pub fn reset_distinct_field(&mut self) {
@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} }
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) { pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
self.synonyms = if synonyms.is_empty() { self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
Setting::Reset
} else {
Setting::Set(synonyms)
}
} }
pub fn reset_primary_key(&mut self) { pub fn reset_primary_key(&mut self) {
@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where where
F: Fn(UpdateIndexingStep, u64) + Sync F: Fn(UpdateIndexingStep, u64) + Sync,
{ {
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let update_id = self.update_id; let update_id = self.update_id;
@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}; };
// There already has been a document addition, the primary key should be set by now. // There already has been a document addition, the primary key should be set by now.
let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; let primary_key =
self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
// We remap the documents fields based on the new `FieldsIdsMap`. // We remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.remap_index_documents( let output = transform.remap_index_documents(
@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
Setting::Set(ref fields) => { Setting::Set(ref fields) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// fields are deduplicated, only the first occurrence is taken into account // fields are deduplicated, only the first occurrence is taken into account
let names: Vec<_> = fields let names: Vec<_> = fields.iter().unique().map(String::as_str).collect();
.iter()
.unique()
.map(String::as_str)
.collect();
for name in names.iter() { for name in names.iter() {
fields_ids_map fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
.insert(name)
.ok_or(UserError::AttributeLimitReached)?;
} }
self.index.put_displayed_fields(self.wtxn, &names)?; self.index.put_displayed_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
} }
Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; } Setting::Reset => {
self.index.delete_displayed_fields(self.wtxn)?;
}
Setting::NotSet => return Ok(false), Setting::NotSet => return Ok(false),
} }
Ok(true) Ok(true)
@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
match self.distinct_field { match self.distinct_field {
Setting::Set(ref attr) => { Setting::Set(ref attr) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
fields_ids_map fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?;
.insert(attr)
.ok_or(UserError::AttributeLimitReached)?;
self.index.put_distinct_field(self.wtxn, &attr)?; self.index.put_distinct_field(self.wtxn, &attr)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
} }
Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; }, Setting::Reset => {
self.index.delete_distinct_field(self.wtxn)?;
}
Setting::NotSet => return Ok(false), Setting::NotSet => return Ok(false),
} }
Ok(true) Ok(true)
@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let mut new_fields_ids_map = FieldsIdsMap::new(); let mut new_fields_ids_map = FieldsIdsMap::new();
// fields are deduplicated, only the first occurrence is taken into account // fields are deduplicated, only the first occurrence is taken into account
let names = fields let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();
.iter()
.unique()
.map(String::as_str)
.collect::<Vec<_>>();
// Add all the searchable attributes to the field map, and then add the // Add all the searchable attributes to the field map, and then add the
// remaining fields from the old field map to the new one // remaining fields from the old field map to the new one
for name in names.iter() { for name in names.iter() {
new_fields_ids_map new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
.insert(&name)
.ok_or(UserError::AttributeLimitReached)?;
} }
for (_, name) in old_fields_ids_map.iter() { for (_, name) in old_fields_ids_map.iter() {
new_fields_ids_map new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
.insert(&name)
.ok_or(UserError::AttributeLimitReached)?;
} }
self.index.put_searchable_fields(self.wtxn, &names)?; self.index.put_searchable_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
} }
Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; } Setting::Reset => {
self.index.delete_searchable_fields(self.wtxn)?;
}
Setting::NotSet => return Ok(false), Setting::NotSet => return Ok(false),
} }
Ok(true) Ok(true)
@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let fst = fst::Set::from_iter(stop_words)?; let fst = fst::Set::from_iter(stop_words)?;
// Does the new FST differ from the previous one? // Does the new FST differ from the previous one?
if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { if current
.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes())
{
// we want to re-create our FST. // we want to re-create our FST.
self.index.put_stop_words(self.wtxn, &fst)?; self.index.put_stop_words(self.wtxn, &fst)?;
Ok(true) Ok(true)
@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
analyzer analyzer
.analyze(text) .analyze(text)
.tokens() .tokens()
.filter_map(|token| .filter_map(|token| {
if token.is_word() { Some(token.text().to_string()) } else { None } if token.is_word() {
) Some(token.text().to_string())
} else {
None
}
})
.collect::<Vec<_>>() .collect::<Vec<_>>()
} }
@ -360,22 +357,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
for (word, synonyms) in synonyms { for (word, synonyms) in synonyms {
// Normalize both the word and associated synonyms. // Normalize both the word and associated synonyms.
let normalized_word = normalize(&analyzer, word); let normalized_word = normalize(&analyzer, word);
let normalized_synonyms = synonyms let normalized_synonyms =
.iter() synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
.map(|synonym| normalize(&analyzer, synonym));
// Store the normalized synonyms under the normalized word, // Store the normalized synonyms under the normalized word,
// merging the possible duplicate words. // merging the possible duplicate words.
let entry = new_synonyms let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
.entry(normalized_word)
.or_insert_with(Vec::new);
entry.extend(normalized_synonyms); entry.extend(normalized_synonyms);
} }
// Make sure that we don't have duplicate synonyms. // Make sure that we don't have duplicate synonyms.
new_synonyms new_synonyms.iter_mut().for_each(|(_, synonyms)| {
.iter_mut()
.for_each(|(_, synonyms)| {
synonyms.sort_unstable(); synonyms.sort_unstable();
synonyms.dedup(); synonyms.dedup();
}); });
@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_filterable_fields(self.wtxn, &new_facets)?; self.index.put_filterable_fields(self.wtxn, &new_facets)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
} }
Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; } Setting::Reset => {
self.index.delete_filterable_fields(self.wtxn)?;
}
Setting::NotSet => (), Setting::NotSet => (),
} }
Ok(()) Ok(())
@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_criteria(self.wtxn, &new_criteria)?; self.index.put_criteria(self.wtxn, &new_criteria)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
} }
Setting::Reset => { self.index.delete_criteria(self.wtxn)?; } Setting::Reset => {
self.index.delete_criteria(self.wtxn)?;
}
Setting::NotSet => (), Setting::NotSet => (),
} }
Ok(()) Ok(())
@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} else { } else {
Err(UserError::PrimaryKeyCannotBeChanged.into()) Err(UserError::PrimaryKeyCannotBeChanged.into())
} }
}, }
Setting::Reset => { Setting::Reset => {
if self.index.number_of_documents(&self.wtxn)? == 0 { if self.index.number_of_documents(&self.wtxn)? == 0 {
self.index.delete_primary_key(self.wtxn)?; self.index.delete_primary_key(self.wtxn)?;
@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} else { } else {
Err(UserError::PrimaryKeyCannotBeReset.into()) Err(UserError::PrimaryKeyCannotBeReset.into())
} }
}, }
Setting::NotSet => Ok(()), Setting::NotSet => Ok(()),
} }
} }
pub fn execute<F>(mut self, progress_callback: F) -> Result<()> pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where where
F: Fn(UpdateIndexingStep, u64) + Sync F: Fn(UpdateIndexingStep, u64) + Sync,
{ {
self.index.set_updated_at(self.wtxn, &Utc::now())?; self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use heed::EnvOpenOptions;
use heed::types::ByteSlice;
use maplit::{btreeset, hashmap, hashset};
use big_s::S; use big_s::S;
use heed::types::ByteSlice;
use heed::EnvOpenOptions;
use maplit::{btreeset, hashmap, hashset};
use super::*;
use crate::error::Error; use crate::error::Error;
use crate::update::{IndexDocuments, UpdateFormat}; use crate::update::{IndexDocuments, UpdateFormat};
use crate::{Criterion, FilterCondition, SearchResult}; use crate::{Criterion, FilterCondition, SearchResult};
use super::*;
#[test] #[test]
fn set_and_reset_searchable_fields() { fn set_and_reset_searchable_fields() {
let path = tempfile::tempdir().unwrap(); let path = tempfile::tempdir().unwrap();
@ -695,9 +690,12 @@ mod tests {
assert_eq!(fields_ids, hashset! { S("age") }); assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values. // Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood. // TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); .prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 3); assert_eq!(count, 3);
drop(rtxn); drop(rtxn);
@ -718,9 +716,12 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values. // Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood. // TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>() .remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); .prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 4); assert_eq!(count, 4);
} }

View File

@ -1,8 +1,8 @@
use grenad::CompressionType; use grenad::CompressionType;
use rayon::ThreadPool; use rayon::ThreadPool;
use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings};
use crate::{Index, Result}; use crate::{Index, Result};
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
pub struct UpdateBuilder<'a> { pub struct UpdateBuilder<'a> {
pub(crate) log_every_n: Option<usize>, pub(crate) log_every_n: Option<usize>,
@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> {
self, self,
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
) -> ClearDocuments<'t, 'u, 'i> ) -> ClearDocuments<'t, 'u, 'i> {
{
ClearDocuments::new(wtxn, index, self.update_id) ClearDocuments::new(wtxn, index, self.update_id)
} }
@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> {
self, self,
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
) -> Result<DeleteDocuments<'t, 'u, 'i>> ) -> Result<DeleteDocuments<'t, 'u, 'i>> {
{
DeleteDocuments::new(wtxn, index, self.update_id) DeleteDocuments::new(wtxn, index, self.update_id)
} }
@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> {
self, self,
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
) -> IndexDocuments<'t, 'u, 'i, 'a> ) -> IndexDocuments<'t, 'u, 'i, 'a> {
{
let mut builder = IndexDocuments::new(wtxn, index, self.update_id); let mut builder = IndexDocuments::new(wtxn, index, self.update_id);
builder.log_every_n = self.log_every_n; builder.log_every_n = self.log_every_n;
@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> {
self, self,
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
) -> Settings<'a, 't, 'u, 'i> ) -> Settings<'a, 't, 'u, 'i> {
{
let mut builder = Settings::new(wtxn, index, self.update_id); let mut builder = Settings::new(wtxn, index, self.update_id);
builder.log_every_n = self.log_every_n; builder.log_every_n = self.log_every_n;
@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> {
self, self,
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
) -> Facets<'t, 'u, 'i> ) -> Facets<'t, 'u, 'i> {
{
let mut builder = Facets::new(wtxn, index, self.update_id); let mut builder = Facets::new(wtxn, index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_type = self.chunk_compression_type;

View File

@ -1,15 +1,13 @@
use std::str; use std::str;
use crate::Index;
use fst::Streamer; use fst::Streamer;
use grenad::CompressionType; use grenad::CompressionType;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use crate::Result;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod,
}; };
use crate::{Index, Result};
pub struct WordPrefixDocids<'t, 'u, 'i> { pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
} }
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids { WordPrefixDocids {
wtxn, wtxn,
index, index,

View File

@ -1,18 +1,17 @@
use std::str; use std::str;
use fst::automaton::{Automaton, Str}; use fst::automaton::{Automaton, Str};
use fst::{Streamer, IntoStreamer}; use fst::{IntoStreamer, Streamer};
use grenad::CompressionType; use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::BytesEncode;
use log::debug; use log::debug;
use crate::{Index, Result};
use crate::heed_codec::StrStrU8Codec; use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{ use crate::update::index_documents::{
WriteMethod, create_sorter, sorter_into_lmdb_database, cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,
cbo_roaring_bitmap_merge,
}; };
use crate::{Index, Result};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
pub fn new( pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
) -> WordPrefixPairProximityDocids<'t, 'u, 'i> ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
{
WordPrefixPairProximityDocids { WordPrefixPairProximityDocids {
wtxn, wtxn,
index, index,

View File

@ -1,25 +1,23 @@
use std::{cmp, str};
use std::convert::TryFrom; use std::convert::TryFrom;
use std::fs::File; use std::fs::File;
use std::num::NonZeroU32; use std::num::NonZeroU32;
use std::{cmp, str};
use fst::automaton::{self, Automaton}; use fst::automaton::{self, Automaton};
use fst::{Streamer, IntoStreamer}; use fst::{IntoStreamer, Streamer};
use grenad::{CompressionType, Reader, Writer, FileFuse}; use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::{BytesEncode, Error}; use heed::{BytesEncode, Error};
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::InternalError; use crate::error::InternalError;
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec};
use crate::Result;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database,
cbo_roaring_bitmap_merge, sorter_into_lmdb_database write_into_lmdb_database, writer_into_reader, WriteMethod,
}; };
use crate::{Index, TreeLevel}; use crate::{Index, Result, TreeLevel};
pub struct WordsLevelPositions<'t, 'u, 'i> { pub struct WordsLevelPositions<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
} }
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> { pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordsLevelPositions<'t, 'u, 'i> {
WordsLevelPositions { WordsLevelPositions {
wtxn, wtxn,
index, index,
@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
self.wtxn, self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(), *self.index.word_prefix_level_position_docids.as_polymorph(),
entries, entries,
|_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }), |_, _| {
Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })
},
WriteMethod::Append, WriteMethod::Append,
)?; )?;
@ -176,13 +179,11 @@ fn compute_positions_levels(
shrink_size: Option<u64>, shrink_size: Option<u64>,
level_group_size: NonZeroU32, level_group_size: NonZeroU32,
min_level_size: NonZeroU32, min_level_size: NonZeroU32,
) -> Result<Reader<FileFuse>> ) -> Result<Reader<FileFuse>> {
{
// It is forbidden to keep a cursor and write in a database at the same time with LMDB // It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transfering them. // therefore we write the facet levels entries into a grenad file before transfering them.
let mut writer = tempfile::tempfile().and_then(|file| { let mut writer = tempfile::tempfile()
create_writer(compression_type, compression_level, file) .and_then(|file| create_writer(compression_type, compression_level, file))?;
})?;
for result in words_db.iter(rtxn)? { for result in words_db.iter(rtxn)? {
let (word, ()) = result?; let (word, ()) = result?;
@ -193,7 +194,8 @@ fn compute_positions_levels(
left..=right left..=right
}; };
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>() let first_level_size = words_positions_db
.remap_data_type::<DecodeIgnore>()
.range(rtxn, &level_0_range)? .range(rtxn, &level_0_range)?
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
@ -253,8 +255,7 @@ fn write_level_entry(
left: u32, left: u32,
right: u32, right: u32,
ids: &RoaringBitmap, ids: &RoaringBitmap,
) -> Result<()> ) -> Result<()> {
{
let key = (word, level, left, right); let key = (word, level, left, right);
let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;

View File

@ -2,7 +2,8 @@ use std::iter::FromIterator;
use std::str; use std::str;
use fst::Streamer; use fst::Streamer;
use crate::{Index, SmallString32, Result};
use crate::{Index, Result, SmallString32};
pub struct WordsPrefixesFst<'t, 'u, 'i> { pub struct WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
update_id: u64, update_id: u64,
) -> WordsPrefixesFst<'t, 'u, 'i> ) -> WordsPrefixesFst<'t, 'u, 'i> {
{
WordsPrefixesFst { WordsPrefixesFst {
wtxn, wtxn,
index, index,
@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length { for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new(); let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0; let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory(); let mut builder = fst::SetBuilder::memory();

View File

@ -1,9 +1,8 @@
use milli::{Criterion, Index, DocumentId};
use milli::update::{IndexDocuments, UpdateFormat, Settings};
use big_s::S; use big_s::S;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::{hashmap, hashset}; use maplit::{hashmap, hashset};
use milli::update::{IndexDocuments, Settings, UpdateFormat};
use milli::{Criterion, DocumentId, Index};
use serde::Deserialize; use serde::Deserialize;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
@ -11,7 +10,8 @@ mod query_criteria;
pub const TEST_QUERY: &'static str = "hello world america"; pub const TEST_QUERY: &'static str = "hello world america";
pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] =
&["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"];
pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); pub const CONTENT: &str = include_str!("../assets/test_set.ndjson");
@ -53,12 +53,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> { pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
let mut rtxn = index.read_txn().unwrap(); let mut rtxn = index.read_txn().unwrap();
let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); let docid_map = index.external_documents_ids(&mut rtxn).unwrap();
let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); let docid_map: std::collections::HashMap<_, _> =
EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect()
} }
pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec<TestDocument> { pub fn expected_order(
let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); criteria: &[Criterion],
authorize_typo: bool,
optional_words: bool,
) -> Vec<TestDocument> {
let dataset =
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
let mut groups: Vec<Vec<TestDocument>> = vec![dataset]; let mut groups: Vec<Vec<TestDocument>> = vec![dataset];
for criterion in criteria { for criterion in criteria {
@ -67,32 +73,36 @@ pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_wor
match criterion { match criterion {
Criterion::Attribute => { Criterion::Attribute => {
group.sort_by_key(|d| d.attribute_rank); group.sort_by_key(|d| d.attribute_rank);
new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); new_groups
}, .extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from));
}
Criterion::Exactness => { Criterion::Exactness => {
group.sort_by_key(|d| d.exact_rank); group.sort_by_key(|d| d.exact_rank);
new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from));
}, }
Criterion::Proximity => { Criterion::Proximity => {
group.sort_by_key(|d| d.proximity_rank); group.sort_by_key(|d| d.proximity_rank);
new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); new_groups
}, .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from));
}
Criterion::Typo => { Criterion::Typo => {
group.sort_by_key(|d| d.typo_rank); group.sort_by_key(|d| d.typo_rank);
new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from));
}, }
Criterion::Words => { Criterion::Words => {
group.sort_by_key(|d| d.word_rank); group.sort_by_key(|d| d.word_rank);
new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from));
}, }
Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { Criterion::Asc(field_name) if field_name == "asc_desc_rank" => {
group.sort_by_key(|d| d.asc_desc_rank); group.sort_by_key(|d| d.asc_desc_rank);
new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); new_groups
}, .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
}
Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { Criterion::Desc(field_name) if field_name == "asc_desc_rank" => {
group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank));
new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); new_groups
}, .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
}
Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()),
} }
} }

View File

@ -1,9 +1,9 @@
use big_s::S; use big_s::S;
use milli::update::Settings; use milli::update::Settings;
use milli::{Search, SearchResult, Criterion}; use milli::{Criterion, Search, SearchResult};
use Criterion::*;
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
use Criterion::*;
const ALLOW_TYPOS: bool = true; const ALLOW_TYPOS: bool = true;
const DISALLOW_TYPOS: bool = false; const DISALLOW_TYPOS: bool = false;
@ -35,29 +35,54 @@ macro_rules! test_criterion {
} }
} }
#[rustfmt::skip]
test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS); test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS);
#[rustfmt::skip]
test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS); test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS);
#[rustfmt::skip]
test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words); test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words);
#[rustfmt::skip]
test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute); test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute);
#[rustfmt::skip]
test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute); test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute);
#[rustfmt::skip]
test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness); test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness);
#[rustfmt::skip]
test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness); test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness);
#[rustfmt::skip]
test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity); test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity);
#[rustfmt::skip]
test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity); test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity);
#[rustfmt::skip]
test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank"))); test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank"))); test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank")));
#[rustfmt::skip]
test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field"))); test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field")));
#[rustfmt::skip]
test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field"))); test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field")));
#[rustfmt::skip]
test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field"))); test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field")));
#[rustfmt::skip]
test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field"))); test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field")));
#[test] #[test]
fn criteria_mixup() { fn criteria_mixup() {
use Criterion::*; use Criterion::*;
let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]); let index = search::setup_search_index_with_criteria(&vec![
Words,
Attribute,
Desc(S("asc_desc_rank")),
Exactness,
Proximity,
Typo,
]);
#[rustfmt::skip]
let criteria_mix = { let criteria_mix = {
// Criterion doesn't implement Copy, we create a new Criterion using a closure // Criterion doesn't implement Copy, we create a new Criterion using a closure
let desc = || Desc(S("asc_desc_rank")); let desc = || Desc(S("asc_desc_rank"));
@ -205,7 +230,8 @@ fn criteria_mixup() {
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) let expected_external_ids: Vec<_> =
search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS)
.into_iter() .into_iter()
.map(|d| d.id) .map(|d| d.id)
.collect(); .collect();

36
script/pre-commit Executable file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
#
# Git pre-commit hook: refuses the commit when the workspace does not compile
# or when the code is not formatted according to rustfmt.
#
# Install it from the root of the project with:
#     cp script/pre-commit .git/hooks/pre-commit
#
# The hook can be bypassed for a single commit with `git commit --no-verify`.

# Step 1: every target of every workspace member must type-check.
# The tool output is discarded on purpose; only the exit status matters here
# and a short explanation is printed instead.
if ! cargo check --workspace --all-targets &>/dev/null; then
    # The quoted delimiter (\EOF) disables expansion inside the here-document,
    # so the literal `!!` below is printed as-is.
    cat <<\EOF
The project does not compile. You might want to fix your error before committing.
If you still want to commit you can do it by appending
--no-verify
at the end of your previous command.
If you are running a variant of bash you can directly paste this command in your terminal:
!! --no-verify
EOF
    exit 1
fi

# Step 2: the code must already be formatted; `--check` only reports
# differences through the exit status and never rewrites files.
if ! cargo fmt --all -- --check &>/dev/null; then
    cat <<\EOF
The project is badly formatted. Please run:
cargo fmt --all
If you want to create your commit without proper formatting you can add
--no-verify
at the end of your commit.
If you are running a variant of bash you can directly paste this command in your terminal:
!! --no-verify
EOF
    exit 1
fi

View File

@ -6,10 +6,9 @@ use std::time::Instant;
use byte_unit::Byte; use byte_unit::Byte;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use log::debug; use log::debug;
use milli::{obkv_to_json, Index};
use structopt::StructOpt; use structopt::StructOpt;
use milli::{Index, obkv_to_json};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
#[global_allocator] #[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
@ -86,7 +85,8 @@ fn main() -> anyhow::Result<()> {
} }
if opt.print_facet_distribution { if opt.print_facet_distribution {
let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; let facets =
index.facets_distribution(&rtxn).candidates(result.candidates).execute()?;
serde_json::to_writer(&mut stdout, &facets)?; serde_json::to_writer(&mut stdout, &facets)?;
let _ = writeln!(&mut stdout); let _ = writeln!(&mut stdout);
} }