184: Transfer numbers and strings facets into the appropriate facet databases r=Kerollmops a=Kerollmops

This pull request is related to https://github.com/meilisearch/milli/issues/152 and changes the layout of the facet values: numbers and strings are now stored in dedicated databases, and the user no longer needs to define the type of the fields. No conversion between the two types is performed anymore: numbers (floats and integers, both converted to f64) go to the facet float database and strings go to the facet string database.

There is one related issue that I found regarding CSVs: the values in a CSV are always considered to be strings. [meilisearch/specifications#28](d916b57d74/text/0028-indexing-csv.md) fixes this issue by allowing the user to define the field types using `:` (see the "CSV Formatting Rules" section).

All existing facet tests have been updated so that they pass again, and I have also run manual tests with the 115m songs dataset. Everything seems to be good!

Fixes #192.

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2021-05-31 13:32:58 +00:00 committed by GitHub
commit 2f5e61bacb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 1046 additions and 963 deletions

View File

@ -30,7 +30,6 @@ use warp::{Filter, http::Response};
use warp::filters::ws::Message; use warp::filters::ws::Message;
use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore}; use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore};
use milli::facet::FacetValue;
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::update::UpdateIndexingStep::*; use milli::update::UpdateIndexingStep::*;
@ -252,7 +251,7 @@ struct Settings {
searchable_attributes: Setting<Vec<String>>, searchable_attributes: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[serde(default, skip_serializing_if = "Setting::is_not_set")]
faceted_attributes: Setting<HashMap<String, String>>, faceted_attributes: Setting<HashSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[serde(default, skip_serializing_if = "Setting::is_not_set")]
criteria: Setting<Vec<String>>, criteria: Setting<Vec<String>>,
@ -671,7 +670,7 @@ async fn main() -> anyhow::Result<()> {
struct Answer { struct Answer {
documents: Vec<Map<String, Value>>, documents: Vec<Map<String, Value>>,
number_of_candidates: u64, number_of_candidates: u64,
facets: BTreeMap<String, BTreeMap<FacetValue, u64>>, facets: BTreeMap<String, BTreeMap<String, u64>>,
} }
let disable_highlighting = opt.disable_highlighting; let disable_highlighting = opt.disable_highlighting;
@ -985,7 +984,7 @@ async fn main() -> anyhow::Result<()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use maplit::{btreeset,hashmap}; use maplit::{btreeset,hashmap, hashset};
use serde_test::{assert_tokens, Token}; use serde_test::{assert_tokens, Token};
use milli::update::Setting; use milli::update::Setting;
@ -997,10 +996,10 @@ mod tests {
let settings = Settings { let settings = Settings {
displayed_attributes: Setting::Set(vec!["name".to_string()]), displayed_attributes: Setting::Set(vec!["name".to_string()]),
searchable_attributes: Setting::Set(vec!["age".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]),
faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), faceted_attributes: Setting::Set(hashset!{ "age".to_string() }),
criteria: Setting::Set(vec!["asc(age)".to_string()]), criteria: Setting::Set(vec!["asc(age)".to_string()]),
stop_words: Setting::Set(btreeset! { "and".to_string() }), stop_words: Setting::Set(btreeset! { "and".to_string() }),
synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }) synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] })
}; };
assert_tokens(&settings, &[ assert_tokens(&settings, &[

View File

@ -5,6 +5,7 @@ use std::{str, io, fmt};
use anyhow::Context; use anyhow::Context;
use byte_unit::Byte; use byte_unit::Byte;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use milli::facet::FacetType;
use milli::{Index, TreeLevel}; use milli::{Index, TreeLevel};
use structopt::StructOpt; use structopt::StructOpt;
@ -22,8 +23,11 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids";
const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids";
const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s";
const FIELD_ID_DOCID_FACET_STRINGS_DB_NAME: &str = "field-id-docid-facet-strings";
const DOCUMENTS_DB_NAME: &str = "documents"; const DOCUMENTS_DB_NAME: &str = "documents";
const ALL_DATABASE_NAMES: &[&str] = &[ const ALL_DATABASE_NAMES: &[&str] = &[
@ -35,8 +39,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
WORD_LEVEL_POSITION_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, FACET_ID_F64_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_VALUES_DB_NAME, FACET_ID_STRING_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_F64S_DB_NAME,
FIELD_ID_DOCID_FACET_STRINGS_DB_NAME,
DOCUMENTS_DB_NAME, DOCUMENTS_DB_NAME,
]; ];
@ -108,8 +114,18 @@ enum Command {
prefixes: Vec<String>, prefixes: Vec<String>,
}, },
/// Outputs a CSV with the documents ids along with the facet values where it appears. /// Outputs a CSV with the documents ids along with the facet numbers where it appears.
FacetValuesDocids { FacetNumbersDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
/// The field name in the document.
field_name: String,
},
/// Outputs a CSV with the documents ids along with the facet strings where it appears.
FacetStringsDocids {
/// Display the whole documents ids in details. /// Display the whole documents ids in details.
#[structopt(long)] #[structopt(long)]
full_display: bool, full_display: bool,
@ -149,8 +165,8 @@ enum Command {
internal_documents_ids: Vec<u32>, internal_documents_ids: Vec<u32>,
}, },
/// Outputs some facets statistics for the given facet name. /// Outputs some facets numbers statistics for the given facet name.
FacetStats { FacetNumberStats {
/// The field name in the document. /// The field name in the document.
field_name: String, field_name: String,
}, },
@ -243,8 +259,11 @@ fn main() -> anyhow::Result<()> {
WordsPrefixesDocids { full_display, prefixes } => { WordsPrefixesDocids { full_display, prefixes } => {
words_prefixes_docids(&index, &rtxn, !full_display, prefixes) words_prefixes_docids(&index, &rtxn, !full_display, prefixes)
}, },
FacetValuesDocids { full_display, field_name } => { FacetNumbersDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, field_name) facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name)
},
FacetStringsDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name)
}, },
WordsLevelPositionsDocids { full_display, words } => { WordsLevelPositionsDocids { full_display, words } => {
words_level_positions_docids(&index, &rtxn, !full_display, words) words_level_positions_docids(&index, &rtxn, !full_display, words)
@ -255,7 +274,7 @@ fn main() -> anyhow::Result<()> {
DocidsWordsPositions { full_display, internal_documents_ids } => { DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
}, },
FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositionsByWord => { AverageNumberOfPositionsByWord => {
average_number_of_positions_by_word(&index, &rtxn) average_number_of_positions_by_word(&index, &rtxn)
@ -297,36 +316,22 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
} }
/// Helper function that converts the facet value key to a unique type /// Helper function that converts the facet value key to a unique type
/// that can be used to log or display purposes. /// that can be used for log or display purposes.
fn facet_values_iter<'txn, DC: 'txn, T>( fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
rtxn: &'txn heed::RoTxn, rtxn: &'txn heed::RoTxn,
db: heed::Database<heed::types::ByteSlice, DC>, db: heed::Database<KC, DC>,
field_id: u8, field_id: u8,
facet_type: milli::facet::FacetType, ) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
string_fn: impl Fn(&str) -> T + 'txn,
float_fn: impl Fn(u8, f64, f64) -> T + 'txn,
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn>>
where where
KC: heed::BytesDecode<'txn>,
DC: heed::BytesDecode<'txn>, DC: heed::BytesDecode<'txn>,
{ {
use milli::facet::FacetType; let iter = db
use milli::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; .remap_key_type::<heed::types::ByteSlice>()
.prefix_iter(&rtxn, &[field_id])?
.remap_key_type::<KC>();
let iter = db.prefix_iter(&rtxn, &[field_id])?; Ok(Box::new(iter))
match facet_type {
FacetType::String => {
let iter = iter.remap_key_type::<FacetValueStringCodec>()
.map(move |r| r.map(|((_, key), value)| (string_fn(key), value)));
Ok(Box::new(iter) as Box<dyn Iterator<Item=_>>)
},
FacetType::Number => {
let iter = iter.remap_key_type::<FacetLevelValueF64Codec>()
.map(move |r| r.map(|((_, level, left, right), value)| {
(float_fn(level, left, right), value)
}));
Ok(Box::new(iter))
},
}
} }
fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> (u8, String) { fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> (u8, String) {
@ -352,9 +357,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
word_level_position_docids, word_level_position_docids,
word_prefix_level_position_docids, word_prefix_level_position_docids,
facet_field_id_value_docids, facet_id_f64_docids,
field_id_docid_facet_values: _, facet_id_string_docids,
documents field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
documents,
} = index; } = index;
let main_name = "main"; let main_name = "main";
@ -365,7 +372,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let word_level_position_docids_name = "word_level_position_docids"; let word_level_position_docids_name = "word_level_position_docids";
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids";
let facet_id_string_docids_name = "facet_id_string_docids";
let documents_name = "documents"; let documents_name = "documents";
let mut heap = BinaryHeap::with_capacity(limit + 1); let mut heap = BinaryHeap::with_capacity(limit + 1);
@ -437,27 +445,27 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let faceted_fields = index.faceted_fields_ids(rtxn)?; let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
for (field_id, field_type) in faceted_fields {
let facet_name = fields_ids_map.name(field_id).unwrap();
let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>(); for facet_id in faceted_fields {
let iter = facet_values_iter( let facet_name = fields_ids_map.name(facet_id).unwrap();
rtxn,
db,
field_id,
field_type,
|key| key.to_owned(),
|level, left, right| {
let mut output = facet_number_value_to_string(level, left, right).1;
let _ = write!(&mut output, " (level {})", level);
output
},
)?;
for result in iter { // List the facet numbers of this facet id.
let (fvalue, value) = result?; let db = facet_id_f64_docids.remap_data_type::<ByteSlice>();
for result in facet_values_iter(rtxn, db, facet_id)? {
let ((_fid, level, left, right), value) = result?;
let mut output = facet_number_value_to_string(level, left, right).1;
write!(&mut output, " (level {})", level)?;
let key = format!("{} {}", facet_name, output);
heap.push(Reverse((value.len(), key, facet_id_f64_docids_name)));
if heap.len() > limit { heap.pop(); }
}
// List the facet strings of this facet id.
let db = facet_id_string_docids.remap_data_type::<ByteSlice>();
for result in facet_values_iter(rtxn, db, facet_id)? {
let ((_fid, fvalue), value) = result?;
let key = format!("{} {}", facet_name, fvalue); let key = format!("{} {}", facet_name, fvalue);
heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); heap.push(Reverse((value.len(), key, facet_id_string_docids_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit { heap.pop(); }
} }
} }
@ -536,38 +544,55 @@ fn words_prefixes_docids(
Ok(wtr.flush()?) Ok(wtr.flush()?)
} }
fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { fn facet_values_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
facet_type: FacetType,
field_name: String,
) -> anyhow::Result<()>
{
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name) let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?; .with_context(|| format!("field {} not found", field_name))?;
let field_type = faceted_fields.get(&field_id)
.with_context(|| format!("field {} is not faceted", field_name))?; if !faceted_fields.contains(&field_id) {
anyhow::bail!("field {} is not faceted", field_name);
}
let stdout = io::stdout(); let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock()); let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["facet_value", "facet_level", "documents_count", "documents_ids"])?;
let db = index.facet_field_id_value_docids; match facet_type {
let iter = facet_values_iter( FacetType::Number => {
rtxn, wtr.write_record(&["facet_number", "facet_level", "documents_count", "documents_ids"])?;
db, for result in facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)? {
field_id, let ((_fid, level, left, right), docids) = result?;
*field_type, let value = facet_number_value_to_string(level, left, right).1;
|key| (0, key.to_owned()), let count = docids.len();
facet_number_value_to_string, let docids = if debug {
)?; format!("{:?}", docids)
} else {
for result in iter { format!("{:?}", docids.iter().collect::<Vec<_>>())
let ((level, value), docids) = result?; };
let count = docids.len(); wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?;
let docids = if debug { }
format!("{:?}", docids) },
} else { FacetType::String => {
format!("{:?}", docids.iter().collect::<Vec<_>>()) wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
}; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; let ((_fid, value), docids) = result?;
let count = docids.len();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[value.to_string(), count.to_string(), docids])?;
}
}
} }
Ok(wtr.flush()?) Ok(wtr.flush()?)
@ -684,31 +709,24 @@ fn docids_words_positions(
Ok(wtr.flush()?) Ok(wtr.flush()?)
} }
fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name) let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?; .with_context(|| format!("field {} not found", field_name))?;
let field_type = faceted_fields.get(&field_id)
.with_context(|| format!("field {} is not faceted", field_name))?;
let db = index.facet_field_id_value_docids; if !faceted_fields.contains(&field_id) {
let iter = facet_values_iter( anyhow::bail!("field {} is not faceted", field_name);
rtxn, }
db,
field_id,
*field_type,
|_key| 0u8,
|level, _left, _right| level,
)?;
let iter = facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)?;
println!("The database {:?} facet stats", field_name); println!("The database {:?} facet stats", field_name);
let mut level_size = 0; let mut level_size = 0;
let mut current_level = None; let mut current_level = None;
for result in iter { for result in iter {
let (level, _) = result?; let ((_fid, level, _left, _right), _) = result?;
if let Some(current) = current_level { if let Some(current) = current_level {
if current != level { if current != level {
println!("\tnumber of groups at level {}: {}", current, level_size); println!("\tnumber of groups at level {}: {}", current, level_size);
@ -843,7 +861,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
use heed::types::ByteSlice; use heed::types::ByteSlice;
let Index { let Index {
env: _, env: _env,
main, main,
word_docids, word_docids,
word_prefix_docids, word_prefix_docids,
@ -852,8 +870,10 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
word_level_position_docids, word_level_position_docids,
word_prefix_level_position_docids, word_prefix_level_position_docids,
facet_field_id_value_docids, facet_id_f64_docids,
field_id_docid_facet_values, facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents, documents,
} = index; } = index;
@ -873,8 +893,11 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(),
FIELD_ID_DOCID_FACET_STRINGS_DB_NAME => field_id_docid_facet_strings.as_polymorph(),
DOCUMENTS_DB_NAME => documents.as_polymorph(), DOCUMENTS_DB_NAME => documents.as_polymorph(),
unknown => anyhow::bail!("unknown database {:?}", unknown), unknown => anyhow::bail!("unknown database {:?}", unknown),
}; };

View File

@ -1,4 +1,4 @@
use std::collections::HashMap; use std::collections::HashSet;
use std::fmt; use std::fmt;
use anyhow::{Context, bail}; use anyhow::{Context, bail};
@ -6,8 +6,6 @@ use regex::Regex;
use serde::{Serialize, Deserialize}; use serde::{Serialize, Deserialize};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use crate::facet::FacetType;
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| { static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()
}); });
@ -33,7 +31,7 @@ pub enum Criterion {
} }
impl Criterion { impl Criterion {
pub fn from_str(faceted_attributes: &HashMap<String, FacetType>, txt: &str) -> anyhow::Result<Criterion> { pub fn from_str(faceted_attributes: &HashSet<String>, txt: &str) -> anyhow::Result<Criterion> {
match txt { match txt {
"words" => Ok(Criterion::Words), "words" => Ok(Criterion::Words),
"typo" => Ok(Criterion::Typo), "typo" => Ok(Criterion::Typo),
@ -44,7 +42,9 @@ impl Criterion {
let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?;
let order = caps.get(1).unwrap().as_str(); let order = caps.get(1).unwrap().as_str();
let field_name = caps.get(2).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str();
faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?; faceted_attributes.get(field_name).with_context(|| {
format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name)
})?;
match order { match order {
"asc" => Ok(Criterion::Asc(field_name.to_string())), "asc" => Ok(Criterion::Asc(field_name.to_string())),
"desc" => Ok(Criterion::Desc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())),

View File

@ -1,5 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
use std::path::Path; use std::path::Path;
use anyhow::Context; use anyhow::Context;
@ -14,24 +14,28 @@ use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
}; };
use crate::facet::FacetType; use crate::heed_codec::facet::{
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FacetValueStringCodec, FacetLevelValueF64Codec,
};
use crate::fields_ids_map::FieldsIdsMap; use crate::fields_ids_map::FieldsIdsMap;
pub const CRITERIA_KEY: &str = "criteria"; pub const CRITERIA_KEY: &str = "criteria";
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const PRIMARY_KEY_KEY: &str = "primary-key";
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
pub const WORDS_FST_KEY: &str = "words-fst";
pub const STOP_WORDS_KEY: &str = "stop-words"; pub const STOP_WORDS_KEY: &str = "stop-words";
pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
pub const SYNONYMS_KEY: &str = "synonyms"; pub const SYNONYMS_KEY: &str = "synonyms";
pub const WORDS_FST_KEY: &str = "words-fst";
pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
const CREATED_AT_KEY: &str = "created-at"; const CREATED_AT_KEY: &str = "created-at";
const UPDATED_AT_KEY: &str = "updated-at"; const UPDATED_AT_KEY: &str = "updated-at";
@ -40,33 +44,45 @@ const UPDATED_AT_KEY: &str = "updated-at";
pub struct Index { pub struct Index {
/// The LMDB environment which this index is associated with. /// The LMDB environment which this index is associated with.
pub env: heed::Env, pub env: heed::Env,
/// Contains many different types (e.g. the fields ids map). /// Contains many different types (e.g. the fields ids map).
pub main: PolyDatabase, pub main: PolyDatabase,
/// A word and all the documents ids containing the word. /// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>, pub word_docids: Database<Str, RoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix. /// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>, pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
/// Maps a word and a document id (u32) to all the positions where the given word appears. /// Maps a word and a document id (u32) to all the positions where the given word appears.
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>, pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears. /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the word, level and position range with the docids that corresponds to it. /// Maps the word, level and position range with the docids that corresponds to it.
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the level positions of a word prefix with all the docids where this prefix appears. /// Maps the level positions of a word prefix with all the docids where this prefix appears.
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>, /// Maps the facet field id, level and the number with the docids that corresponds to it.
/// Maps the document id, the facet field id and the globally ordered value. pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
pub field_id_docid_facet_values: Database<ByteSlice, Unit>, /// Maps the facet field id and the string with the docids that corresponds to it.
pub facet_id_string_docids: Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
/// Maps the document id, the facet field id and the numbers.
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Unit>,
/// Maps the document id to the document as an obkv store. /// Maps the document id to the document as an obkv store.
pub documents: Database<OwnedType<BEU32>, ObkvCodec>, pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
} }
impl Index { impl Index {
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> { pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
options.max_dbs(11); options.max_dbs(13);
let env = options.open(path)?; let env = options.open(path)?;
let main = env.create_poly_database(Some("main"))?; let main = env.create_poly_database(Some("main"))?;
@ -77,20 +93,13 @@ impl Index {
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?;
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?;
let field_id_docid_facet_f64s = env.create_database(Some("field-id-docid-facet-f64s"))?;
let field_id_docid_facet_strings = env.create_database(Some("field-id-docid-facet-strings"))?;
let documents = env.create_database(Some("documents"))?; let documents = env.create_database(Some("documents"))?;
{ Index::initialize_creation_dates(&env, main)?;
let mut txn = env.write_txn()?;
// The db was just created, we update its metadata with the relevant information.
if main.get::<_, Str, SerdeJson<DateTime<Utc>>>(&txn, CREATED_AT_KEY)?.is_none() {
let now = Utc::now();
main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut txn, UPDATED_AT_KEY, &now)?;
main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut txn, CREATED_AT_KEY, &now)?;
txn.commit()?;
}
}
Ok(Index { Ok(Index {
env, env,
@ -102,12 +111,26 @@ impl Index {
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
word_level_position_docids, word_level_position_docids,
word_prefix_level_position_docids, word_prefix_level_position_docids,
facet_field_id_value_docids, facet_id_f64_docids,
field_id_docid_facet_values, facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents, documents,
}) })
} }
/// Records creation and update dates in the `main` database when none exist yet.
///
/// A write transaction is opened on `env` and `CREATED_AT_KEY` is looked up.
/// When the key is absent, the current UTC time is written under both
/// `UPDATED_AT_KEY` and `CREATED_AT_KEY` and the transaction is committed.
/// When the key is already present nothing is written and the transaction is
/// dropped without being committed.
///
/// # Errors
///
/// Propagates any `heed` error raised while opening the transaction, reading,
/// writing or committing.
fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> {
    let mut wtxn = env.write_txn()?;

    // A creation date is already stored: leave both dates untouched.
    if main.get::<_, Str, SerdeJson<DateTime<Utc>>>(&wtxn, CREATED_AT_KEY)?.is_some() {
        return Ok(());
    }

    // Both dates share the exact same timestamp on first initialization.
    let timestamp = Utc::now();
    main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut wtxn, UPDATED_AT_KEY, &timestamp)?;
    main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut wtxn, CREATED_AT_KEY, &timestamp)?;
    wtxn.commit()
}
/// Create a write transaction to be able to write into the index. /// Create a write transaction to be able to write into the index.
pub fn write_txn(&self) -> heed::Result<RwTxn> { pub fn write_txn(&self) -> heed::Result<RwTxn> {
self.env.write_txn() self.env.write_txn()
@ -298,53 +321,97 @@ impl Index {
/* faceted fields */ /* faceted fields */
/// Writes the facet fields associated with their facet type or `None` if /// Writes the facet fields names in the database.
/// the facet type is currently unknown. pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> {
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<String, FacetType>) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields)
self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types)
} }
/// Deletes the facet fields ids associated with their facet type. /// Deletes the facet fields ids in the database.
pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY)
} }
/// Returns the facet fields names associated with their facet type. /// Returns the facet fields names.
pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, FacetType>> { pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
} }
/// Returns the ids of the faceted fields, i.e. `faceted_fields` translated
/// through the fields ids map.
///
/// # Panics
///
/// Panics when a faceted field name has no id in the fields ids map, which is
/// treated as corrupted data.
pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashSet<FieldId>> {
    let names = self.faceted_fields(rtxn)?;
    let fields_ids_map = self.fields_ids_map(rtxn)?;

    let mut ids = HashSet::with_capacity(names.len());
    for name in &names {
        let id = fields_ids_map
            .id(name)
            .ok_or_else(|| format!("{:?} should be present in the field id map", name))
            .expect("corrupted data: ");
        ids.insert(id);
    }

    Ok(ids)
}
/* faceted documents ids */ /* faceted documents ids */
/// Writes the documents ids that are faceted under this field id. /// Writes the documents ids that are faceted with numbers under this field id.
pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap) -> heed::Result<()> { pub fn put_number_faceted_documents_ids(
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; &self,
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); wtxn: &mut RwTxn,
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()>
{
let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id; *buffer.last_mut().unwrap() = field_id;
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
} }
/// Retrieve all the documents ids that faceted under this field id. /// Retrieve all the documents ids that faceted with numbers under this field id.
pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: FieldId) -> heed::Result<RoaringBitmap> { pub fn number_faceted_documents_ids(
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; &self,
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap>
{
let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
}
}
/// Writes the documents ids that are faceted with strings under this field id.
///
/// The entry is stored in the `main` database; its key is a prefix followed by
/// the single `field_id` byte.
///
/// NOTE(review): the prefix used here is `NUMBER_FACETED_DOCUMENTS_IDS_PREFIX`
/// although this is the *string* accessor (the number accessors use
/// `STRING_FACETED_DOCUMENTS_IDS_PREFIX`). `string_faceted_documents_ids` reads
/// with the same prefix, so values round-trip, but the names look swapped.
/// Confirm whether this is intentional before touching it: changing the prefix
/// changes the keys of already-stored data.
pub fn put_string_faceted_documents_ids(
    &self,
    wtxn: &mut RwTxn,
    field_id: FieldId,
    docids: &RoaringBitmap,
) -> heed::Result<()>
{
    let prefix = NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes();
    let mut key = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
    // Layout: <prefix bytes><field id>.
    let (head, tail) = key.split_at_mut(prefix.len());
    head.copy_from_slice(prefix);
    tail[0] = field_id;
    self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &key, docids)
}
/// Retrieve all the documents ids that faceted with strings under this field id.
pub fn string_faceted_documents_ids(
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap>
{
let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id; *buffer.last_mut().unwrap() = field_id;
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids), Some(docids) => Ok(docids),

View File

@ -1,18 +1,15 @@
use std::collections::HashMap;
use std::mem::take; use std::mem::take;
use anyhow::{bail, Context as _}; use anyhow::Context;
use itertools::Itertools; use itertools::Itertools;
use log::debug; use log::debug;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetF64Codec;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter; use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
use crate::{FieldsIdsMap, FieldId, Index}; use crate::{FieldId, Index};
use super::{Criterion, CriterionParameters, CriterionResult}; use super::{Criterion, CriterionParameters, CriterionResult};
/// Threshold on the number of candidates that will make /// Threshold on the number of candidates that will make
@ -24,7 +21,6 @@ pub struct AscDesc<'t> {
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
field_name: String, field_name: String,
field_id: FieldId, field_id: FieldId,
facet_type: FacetType,
ascending: bool, ascending: bool,
query_tree: Option<Operation>, query_tree: Option<Operation>,
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>, candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
@ -39,8 +35,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
field_name: String, field_name: String,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self> {
{
Self::new(index, rtxn, parent, field_name, true) Self::new(index, rtxn, parent, field_name, true)
} }
@ -49,8 +44,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
field_name: String, field_name: String,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self> {
{
Self::new(index, rtxn, parent, field_name, false) Self::new(index, rtxn, parent, field_name, false)
} }
@ -60,22 +54,21 @@ impl<'t> AscDesc<'t> {
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
field_name: String, field_name: String,
ascending: bool, ascending: bool,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self> {
{
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?; let field_id = fields_ids_map
let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; .id(&field_name)
.with_context(|| format!("field {:?} isn't registered", field_name))?;
Ok(AscDesc { Ok(AscDesc {
index, index,
rtxn, rtxn,
field_name, field_name,
field_id, field_id,
facet_type,
ascending, ascending,
query_tree: None, query_tree: None,
candidates: Box::new(std::iter::empty()), candidates: Box::new(std::iter::empty()),
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?,
bucket_candidates: RoaringBitmap::new(), bucket_candidates: RoaringBitmap::new(),
parent, parent,
}) })
@ -86,8 +79,10 @@ impl<'t> Criterion for AscDesc<'t> {
#[logging_timer::time("AscDesc::{}")] #[logging_timer::time("AscDesc::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
loop { loop {
debug!("Facet {}({}) iteration", debug!(
if self.ascending { "Asc" } else { "Desc" }, self.field_name "Facet {}({}) iteration",
if self.ascending { "Asc" } else { "Desc" },
self.field_name
); );
match self.candidates.next().transpose()? { match self.candidates.next().transpose()? {
@ -122,7 +117,6 @@ impl<'t> Criterion for AscDesc<'t> {
self.index, self.index,
self.rtxn, self.rtxn,
self.field_id, self.field_id,
self.facet_type,
self.ascending, self.ascending,
candidates, candidates,
)?; )?;
@ -138,27 +132,12 @@ impl<'t> Criterion for AscDesc<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
} }
} }
} }
} }
fn field_id_facet_type(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>,
field: &str,
) -> anyhow::Result<(FieldId, FacetType)>
{
let id = fields_ids_map.id(field).with_context(|| {
format!("field {:?} isn't registered", field)
})?;
let facet_type = faceted_fields.get(field).with_context(|| {
format!("field {:?} isn't faceted", field)
})?;
Ok((id, *facet_type))
}
/// Returns an iterator over groups of the given candidates in ascending or descending order. /// Returns an iterator over groups of the given candidates in ascending or descending order.
/// ///
/// It will either use an iterative or a recursive method on the whole facet database depending /// It will either use an iterative or a recursive method on the whole facet database depending
@ -167,29 +146,20 @@ fn facet_ordered<'t>(
index: &'t Index, index: &'t Index,
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
field_id: FieldId, field_id: FieldId,
facet_type: FacetType,
ascending: bool, ascending: bool,
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> ) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
{ if candidates.len() <= CANDIDATES_THRESHOLD {
match facet_type { let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
FacetType::Number => { Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
if candidates.len() <= CANDIDATES_THRESHOLD { } else {
let iter = iterative_facet_ordered_iter( let facet_fn = if ascending {
index, rtxn, field_id, ascending, candidates, FacetIter::new_reducing
)?; } else {
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) FacetIter::new_reverse_reducing
} else { };
let facet_fn = if ascending { let iter = facet_fn(rtxn, index, field_id, candidates)?;
FacetIter::new_reducing Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
} else {
FacetIter::new_reverse_reducing
};
let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
}
},
FacetType::String => bail!("criteria facet type must be a number"),
} }
} }
@ -202,14 +172,14 @@ fn iterative_facet_ordered_iter<'t>(
field_id: FieldId, field_id: FieldId,
ascending: bool, ascending: bool,
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't> ) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't> {
{
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
let mut docids_values = Vec::with_capacity(candidates.len() as usize); let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() { for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN); let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX); let right = (field_id, docid, f64::MAX);
let mut iter = db.range(rtxn, &(left..=right))?; let mut iter = index
.field_id_docid_facet_f64s
.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() }; let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? { if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value))); docids_values.push((docid, OrderedFloat(value)));
@ -226,7 +196,8 @@ fn iterative_facet_ordered_iter<'t>(
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore // The itertools GroupBy iterator doesn't provide an owned version, we are therefore
// required to collect the result into an owned collection (a Vec). // required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499 // https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter.group_by(|(_, v)| *v) let vec: Vec<_> = iter
.group_by(|(_, v)| v.clone())
.into_iter() .into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect()) .map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect(); .collect();

View File

@ -1,10 +1,14 @@
use std::mem::size_of; use std::mem::size_of;
use heed::types::ByteSlice;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::heed_codec::facet::*;
use crate::{facet::FacetType, DocumentId, FieldId, Index};
use super::{Distinct, DocIter}; use super::{Distinct, DocIter};
use crate::heed_codec::facet::*;
use crate::{DocumentId, FieldId, Index};
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
/// A distinct implementer that is backed by facets. /// A distinct implementer that is backed by facets.
/// ///
@ -18,21 +22,14 @@ pub struct FacetDistinct<'a> {
distinct: FieldId, distinct: FieldId,
index: &'a Index, index: &'a Index,
txn: &'a heed::RoTxn<'a>, txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
} }
impl<'a> FacetDistinct<'a> { impl<'a> FacetDistinct<'a> {
pub fn new( pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
) -> Self {
Self { Self {
distinct, distinct,
index, index,
txn, txn,
facet_type,
} }
} }
} }
@ -41,38 +38,33 @@ pub struct FacetDistinctIter<'a> {
candidates: RoaringBitmap, candidates: RoaringBitmap,
distinct: FieldId, distinct: FieldId,
excluded: RoaringBitmap, excluded: RoaringBitmap,
facet_type: FacetType,
index: &'a Index, index: &'a Index,
iter_offset: usize, iter_offset: usize,
txn: &'a heed::RoTxn<'a>, txn: &'a heed::RoTxn<'a>,
} }
impl<'a> FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> {
fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap> fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
where self.index
KC: heed::BytesEncode<'c>, .facet_id_string_docids
{ .get(self.txn, &(self.distinct, key))
let facet_docids = self }
.index
.facet_field_id_value_docids fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
.remap_key_type::<KC>() // get facet docids on level 0
.get(self.txn, key)? self.index
.expect("Corrupted data: Facet values must exist"); .facet_id_f64_docids
Ok(facet_docids) .get(self.txn, &(self.distinct, 0, key, key))
} }
fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetStringCodec>( let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
id,
self.distinct,
self.index,
self.txn,
)?;
for item in iter { for item in iter {
let ((_, _, value), _) = item?; let ((_, _, value), _) = item?;
let key = (self.distinct, value); let facet_docids = self
let facet_docids = self.get_facet_docids::<FacetValueStringCodec>(&key)?; .facet_string_docids(value)?
.expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids); self.excluded.union_with(&facet_docids);
} }
@ -82,17 +74,13 @@ impl<'a> FacetDistinctIter<'a> {
} }
fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetF64Codec>(id, let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
self.distinct,
self.index,
self.txn,
)?;
for item in iter { for item in iter {
let ((_, _, value), _) = item?; let ((_, _, value), _) = item?;
// get facet docids on level 0 let facet_docids = self
let key = (self.distinct, 0, value, value); .facet_number_docids(value)?
let facet_docids = self.get_facet_docids::<FacetLevelValueF64Codec>(&key)?; .expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids); self.excluded.union_with(&facet_docids);
} }
@ -111,16 +99,16 @@ impl<'a> FacetDistinctIter<'a> {
let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
match candidates_iter.next() { match candidates_iter.next() {
Some(id) => { Some(id) => {
match self.facet_type { // We distinct the document id on its facet strings and facet numbers.
FacetType::String => self.distinct_string(id)?, self.distinct_string(id)?;
FacetType::Number => self.distinct_number(id)?, self.distinct_number(id)?;
};
// The first document of each iteration is kept, since the next call to // The first document of each iteration is kept, since the next call to
// `difference_with` will filter out all the documents for that facet value. By // `difference_with` will filter out all the documents for that facet value. By
// increasing the offset we make sure to get the first valid value for the next // increasing the offset we make sure to get the first valid value for the next
// distinct document to keep. // distinct document to keep.
self.iter_offset += 1; self.iter_offset += 1;
Ok(Some(id)) Ok(Some(id))
} }
// no more candidate at this offset, return. // no more candidate at this offset, return.
@ -129,26 +117,44 @@ impl<'a> FacetDistinctIter<'a> {
} }
} }
fn get_facet_values<'a, KC>( fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
key
}
fn facet_number_values<'a>(
id: DocumentId, id: DocumentId,
distinct: FieldId, distinct: FieldId,
index: &Index, index: &Index,
txn: &'a heed::RoTxn, txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>> ) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
where let key = facet_values_prefix_key(distinct, id);
KC: heed::BytesDecode<'a>,
{
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
let iter = index let iter = index
.field_id_docid_facet_values .field_id_docid_facet_f64s
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)? .prefix_iter(txn, &key)?
.remap_key_type::<KC>(); .remap_key_type::<FieldDocIdFacetF64Codec>();
Ok(iter)
}
fn facet_string_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_strings
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type::<FieldDocIdFacetStringCodec>();
Ok(iter) Ok(iter)
} }
@ -174,7 +180,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
candidates, candidates,
distinct: self.distinct, distinct: self.distinct,
excluded, excluded,
facet_type: self.facet_type,
index: self.index, index: self.index,
iter_offset: 0, iter_offset: 0,
txn: self.txn, txn: self.txn,
@ -184,22 +189,21 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use std::collections::HashMap; use std::collections::HashSet;
use super::*;
use super::super::test::{generate_index, validate_distinct_candidates}; use super::super::test::{generate_index, validate_distinct_candidates};
use crate::facet::FacetType; use super::*;
macro_rules! test_facet_distinct { macro_rules! test_facet_distinct {
($name:ident, $distinct:literal, $facet_type:expr) => { ($name:ident, $distinct:literal) => {
#[test] #[test]
fn $name() { fn $name() {
use std::iter::FromIterator; use std::iter::FromIterator;
let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); let facets = HashSet::from_iter(Some(($distinct.to_string())));
let (index, fid, candidates) = generate_index($distinct, facets); let (index, fid, candidates) = generate_index($distinct, facets);
let txn = index.read_txn().unwrap(); let txn = index.read_txn().unwrap();
let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new(); let excluded = RoaringBitmap::new();
let mut iter = map_distinct.distinct(candidates.clone(), excluded); let mut iter = map_distinct.distinct(candidates.clone(), excluded);
let count = validate_distinct_candidates(iter.by_ref(), fid, &index); let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
@ -209,7 +213,7 @@ mod test {
}; };
} }
test_facet_distinct!(test_string, "txt", FacetType::String); test_facet_distinct!(test_string, "txt");
test_facet_distinct!(test_strings, "txts", FacetType::String); test_facet_distinct!(test_strings, "txts");
test_facet_distinct!(test_number, "cat-int", FacetType::Number); test_facet_distinct!(test_number, "cat-int");
} }

View File

@ -110,7 +110,7 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use std::collections::HashMap; use std::collections::HashSet;
use super::*; use super::*;
use super::super::test::{generate_index, validate_distinct_candidates}; use super::super::test::{generate_index, validate_distinct_candidates};
@ -119,7 +119,7 @@ mod test {
($name:ident, $distinct:literal) => { ($name:ident, $distinct:literal) => {
#[test] #[test]
fn $name() { fn $name() {
let (index, fid, candidates) = generate_index($distinct, HashMap::new()); let (index, fid, candidates) = generate_index($distinct, HashSet::new());
let txn = index.read_txn().unwrap(); let txn = index.read_txn().unwrap();
let mut map_distinct = MapDistinct::new(fid, &index, &txn); let mut map_distinct = MapDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new(); let excluded = RoaringBitmap::new();

View File

@ -28,7 +28,7 @@ pub trait Distinct<'a> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use std::collections::{HashMap, HashSet}; use std::collections::HashSet;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use rand::{seq::SliceRandom, Rng}; use rand::{seq::SliceRandom, Rng};
@ -74,7 +74,7 @@ mod test {
/// Returns a temporary index populated with random test documents, the FieldId for the /// Returns a temporary index populated with random test documents, the FieldId for the
/// distinct attribute, and the RoaringBitmap with the document ids. /// distinct attribute, and the RoaringBitmap with the document ids.
pub(crate) fn generate_index(distinct: &str, facets: HashMap<String, String>) -> (TempIndex, FieldId, RoaringBitmap) { pub(crate) fn generate_index(distinct: &str, facets: HashSet<String>) -> (TempIndex, FieldId, RoaringBitmap) {
let index = TempIndex::new(); let index = TempIndex::new();
let mut txn = index.write_txn().unwrap(); let mut txn = index.write_txn().unwrap();

View File

@ -1,9 +1,8 @@
use std::collections::HashMap; use std::collections::HashSet;
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Bound::{self, Included, Excluded}; use std::ops::Bound::{self, Included, Excluded};
use std::str::FromStr; use std::str::FromStr;
use anyhow::Context;
use either::Either; use either::Either;
use heed::types::DecodeIgnore; use heed::types::DecodeIgnore;
use log::debug; use log::debug;
@ -12,7 +11,6 @@ use pest::iterators::{Pair, Pairs};
use pest::Parser; use pest::Parser;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec};
@ -21,122 +19,96 @@ use super::parser::Rule;
use super::parser::{PREC_CLIMBER, FilterParser}; use super::parser::{PREC_CLIMBER, FilterParser};
use self::FacetCondition::*; use self::FacetCondition::*;
use self::FacetNumberOperator::*; use self::Operator::*;
/// A comparison applied to the value of a faceted field.
///
/// NOTE(review): `Equal` and `NotEqual` carry an optional `f64` next to a
/// `String`; presumably the number is the numeric reading of the value when it
/// parses as one and the string is its textual form — confirm against the parser.
#[derive(Debug, Clone, PartialEq)]
pub enum Operator {
    GreaterThan(f64),
    GreaterThanOrEqual(f64),
    Equal(Option<f64>, String),
    NotEqual(Option<f64>, String),
    LowerThan(f64),
    LowerThanOrEqual(f64),
    /// Two bounds; `negate` turns this into "lower than the first OR greater
    /// than the second".
    Between(f64, f64),
}
impl FacetNumberOperator { impl Operator {
/// This method can return two operations in case it must express /// This method can return two operations in case it must express
/// an OR operation for the between case (i.e. `TO`). /// an OR operation for the between case (i.e. `TO`).
fn negate(self) -> (Self, Option<Self>) { fn negate(self) -> (Self, Option<Self>) {
match self { match self {
GreaterThan(x) => (LowerThanOrEqual(x), None), GreaterThan(n) => (LowerThanOrEqual(n), None),
GreaterThanOrEqual(x) => (LowerThan(x), None), GreaterThanOrEqual(n) => (LowerThan(n), None),
Equal(x) => (NotEqual(x), None), Equal(n, s) => (NotEqual(n, s), None),
NotEqual(x) => (Equal(x), None), NotEqual(n, s) => (Equal(n, s), None),
LowerThan(x) => (GreaterThanOrEqual(x), None), LowerThan(n) => (GreaterThanOrEqual(n), None),
LowerThanOrEqual(x) => (GreaterThan(x), None), LowerThanOrEqual(n) => (GreaterThan(n), None),
Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), Between(n, m) => (LowerThan(n), Some(GreaterThan(m))),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetStringOperator {
Equal(String),
NotEqual(String),
}
impl FacetStringOperator {
fn equal(s: &str) -> Self {
FacetStringOperator::Equal(s.to_lowercase())
}
#[allow(dead_code)]
fn not_equal(s: &str) -> Self {
FacetStringOperator::equal(s).negate()
}
fn negate(self) -> Self {
match self {
FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x),
FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x),
} }
} }
} }
/// A filter expression over faceted fields: either a single operator applied
/// to one field, or the `Or` / `And` combination of two sub-conditions.
#[derive(Debug, Clone, PartialEq)]
pub enum FacetCondition {
    /// An `Operator` applied to the field identified by the `FieldId`.
    Operator(FieldId, Operator),
    Or(Box<Self>, Box<Self>),
    And(Box<Self>, Box<Self>),
}
fn get_field_id_facet_type<'a>( fn field_id(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
items: &mut Pairs<'a, Rule>, items: &mut Pairs<Rule>,
) -> Result<(FieldId, FacetType), PestError<Rule>> ) -> Result<FieldId, PestError<Rule>>
{ {
// lexing ensures that we at least have a key // lexing ensures that we at least have a key
let key = items.next().unwrap(); let key = items.next().unwrap();
let field_id = fields_ids_map
.id(key.as_str())
.ok_or_else(|| {
PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
key.as_str(),
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ")
),
},
key.as_span(),
)
})?;
let facet_type = faceted_fields let field_id = match fields_ids_map.id(key.as_str()) {
.get(&field_id) Some(field_id) => field_id,
.copied() None => return Err(PestError::new_from_span(
.ok_or_else(|| { ErrorVariant::CustomError {
PestError::new_from_span( message: format!(
ErrorVariant::CustomError { "attribute `{}` not found, available attributes are: {}",
message: format!( key.as_str(),
"attribute `{}` is not faceted, available faceted attributes are: {}", fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "),
key.as_str(), ),
faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ") },
), key.as_span(),
}, )),
key.as_span(), };
)
})?;
Ok((field_id, facet_type)) if !faceted_fields.contains(&field_id) {
return Err(PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` is not faceted, available faceted attributes are: {}",
key.as_str(),
faceted_fields.iter().flat_map(|id| {
fields_ids_map.name(*id)
}).collect::<Vec<_>>().join(", "),
),
},
key.as_span(),
));
}
Ok(field_id)
} }
fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>> fn pest_parse<T>(pair: Pair<Rule>) -> (Result<T, pest::error::Error<Rule>>, String)
where T: FromStr, where T: FromStr,
T::Err: ToString, T::Err: ToString,
{ {
match pair.as_str().parse() { let result = match pair.as_str().parse::<T>() {
Ok(value) => Ok(value), Ok(value) => Ok(value),
Err(e) => { Err(e) => Err(PestError::<Rule>::new_from_span(
Err(PestError::<Rule>::new_from_span( ErrorVariant::CustomError { message: e.to_string() },
ErrorVariant::CustomError { message: e.to_string() }, pair.as_span(),
pair.as_span(), )),
)) };
}
} (result, pair.as_str().to_string())
} }
impl FacetCondition { impl FacetCondition {
@ -150,34 +122,6 @@ impl FacetCondition {
A: AsRef<str>, A: AsRef<str>,
B: AsRef<str>, B: AsRef<str>,
{ {
fn facet_condition(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>,
key: &str,
value: &str,
) -> anyhow::Result<FacetCondition>
{
let fid = fields_ids_map.id(key).with_context(|| {
format!("{:?} isn't present in the fields ids map", key)
})?;
let ftype = faceted_fields.get(key).copied().with_context(|| {
format!("{:?} isn't a faceted field", key)
})?;
let (neg, value) = match value.trim().strip_prefix('-') {
Some(value) => (true, value.trim()),
None => (false, value.trim()),
};
let operator = match ftype {
FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)),
FacetType::Number => OperatorNumber(fid, FacetNumberOperator::Equal(value.parse()?)),
};
if neg { Ok(operator.negate()) } else { Ok(operator) }
}
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let mut ands = None; let mut ands = None;
for either in array { for either in array {
@ -185,10 +129,7 @@ impl FacetCondition {
Either::Left(array) => { Either::Left(array) => {
let mut ors = None; let mut ors = None;
for rule in array { for rule in array {
let mut iter = rule.as_ref().splitn(2, ':'); let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?;
let key = iter.next().context("missing facet condition key")?;
let value = iter.next().context("missing facet condition value")?;
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
ors = match ors.take() { ors = match ors.take() {
Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), Some(ors) => Some(Or(Box::new(ors), Box::new(condition))),
None => Some(condition), None => Some(condition),
@ -203,10 +144,7 @@ impl FacetCondition {
} }
}, },
Either::Right(rule) => { Either::Right(rule) => {
let mut iter = rule.as_ref().splitn(2, ':'); let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?;
let key = iter.next().context("missing facet condition key")?;
let value = iter.next().context("missing facet condition value")?;
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
ands = match ands.take() { ands = match ands.take() {
Some(ands) => Some(And(Box::new(ands), Box::new(condition))), Some(ands) => Some(And(Box::new(ands), Box::new(condition))),
None => Some(condition), None => Some(condition),
@ -232,7 +170,7 @@ impl FacetCondition {
fn from_pairs( fn from_pairs(
fim: &FieldsIdsMap, fim: &FieldsIdsMap,
ff: &HashMap<FieldId, FacetType>, ff: &HashSet<FieldId>,
expression: Pairs<Rule>, expression: Pairs<Rule>,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self>
{ {
@ -263,10 +201,9 @@ impl FacetCondition {
fn negate(self) -> FacetCondition { fn negate(self) -> FacetCondition {
match self { match self {
OperatorString(fid, op) => OperatorString(fid, op.negate()), Operator(fid, op) => match op.negate() {
OperatorNumber(fid, op) => match op.negate() { (op, None) => Operator(fid, op),
(op, None) => OperatorNumber(fid, op), (a, Some(b)) => Or(Box::new(Operator(fid, a)), Box::new(Operator(fid, b))),
(a, Some(b)) => Or(Box::new(OperatorNumber(fid, a)), Box::new(OperatorNumber(fid, b))),
}, },
Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())),
And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())),
@ -275,137 +212,96 @@ impl FacetCondition {
fn between( fn between(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let item_span = item.as_span();
let mut items = item.into_inner(); let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let lvalue = items.next().unwrap();
let rvalue = items.next().unwrap(); let (lresult, _) = pest_parse(items.next().unwrap());
match ftype { let (rresult, _) = pest_parse(items.next().unwrap());
FacetType::String => {
Err(PestError::<Rule>::new_from_span( let lvalue = lresult?;
ErrorVariant::CustomError { let rvalue = rresult?;
message: "invalid operator on a faceted string".to_string(),
}, Ok(Operator(fid, Between(lvalue, rvalue)))
item_span,
).into())
},
FacetType::Number => {
let lvalue = pest_parse(lvalue)?;
let rvalue = pest_parse(rvalue)?;
Ok(OperatorNumber(fid, Between(lvalue, rvalue)))
},
}
} }
fn equal( fn equal(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let mut items = item.into_inner(); let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap(); let value = items.next().unwrap();
match ftype { let (result, svalue) = pest_parse(value);
FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))),
FacetType::Number => Ok(OperatorNumber(fid, Equal(pest_parse(value)?))), let svalue = svalue.to_lowercase();
} Ok(Operator(fid, Equal(result.ok(), svalue)))
} }
fn greater_than( fn greater_than(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let item_span = item.as_span();
let mut items = item.into_inner(); let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap(); let value = items.next().unwrap();
match ftype { let (result, _svalue) = pest_parse(value);
FacetType::String => {
Err(PestError::<Rule>::new_from_span( Ok(Operator(fid, GreaterThan(result?)))
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, GreaterThan(pest_parse(value)?))),
}
} }
fn greater_than_or_equal( fn greater_than_or_equal(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let item_span = item.as_span();
let mut items = item.into_inner(); let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap(); let value = items.next().unwrap();
match ftype { let (result, _svalue) = pest_parse(value);
FacetType::String => {
Err(PestError::<Rule>::new_from_span( Ok(Operator(fid, GreaterThanOrEqual(result?)))
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, GreaterThanOrEqual(pest_parse(value)?))),
}
} }
fn lower_than( fn lower_than(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let item_span = item.as_span();
let mut items = item.into_inner(); let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap(); let value = items.next().unwrap();
match ftype { let (result, _svalue) = pest_parse(value);
FacetType::String => {
Err(PestError::<Rule>::new_from_span( Ok(Operator(fid, LowerThan(result?)))
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, LowerThan(pest_parse(value)?))),
}
} }
fn lower_than_or_equal( fn lower_than_or_equal(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>, faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>, item: Pair<Rule>,
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let item_span = item.as_span();
let mut items = item.into_inner(); let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap(); let value = items.next().unwrap();
match ftype { let (result, _svalue) = pest_parse(value);
FacetType::String => {
Err(PestError::<Rule>::new_from_span( Ok(Operator(fid, LowerThanOrEqual(result?)))
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, LowerThanOrEqual(pest_parse(value)?))),
}
} }
} }
@ -485,34 +381,53 @@ impl FacetCondition {
Ok(()) Ok(())
} }
fn evaluate_number_operator<>( fn evaluate_operator(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId, field_id: FieldId,
operator: FacetNumberOperator, operator: &Operator,
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
// Make sure we always bound the ranges with the field id and the level, // Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the // as the facets values are all in the same database and prefixed by the
// field id and the level. // field id and the level.
let (left, right) = match operator { let (left, right) = match operator {
GreaterThan(val) => (Excluded(val), Included(f64::MAX)), GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
Equal(val) => (Included(val), Included(val)), Equal(number, string) => {
NotEqual(val) => { let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; let number_docids = match number {
let docids = Self::evaluate_number_operator(rtxn, index, db, field_id, Equal(val))?; Some(n) => {
return Ok(all_documents_ids - docids); let n = Included(*n);
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?;
output
},
None => RoaringBitmap::new(),
};
return Ok(string_docids | number_docids);
}, },
LowerThan(val) => (Included(f64::MIN), Excluded(val)), NotEqual(number, string) => {
LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), let all_numbers_ids = if number.is_some() {
Between(left, right) => (Included(left), Included(right)), index.number_faceted_documents_ids(rtxn, field_id)?
} else {
RoaringBitmap::new()
};
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Equal(*number, string.clone());
let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
},
LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
Between(left, right) => (Included(*left), Included(*right)),
}; };
// Ask for the biggest value that can exist for this specific field, if it exists // Ask for the biggest value that can exist for this specific field, if it exists
// that's fine if it don't, the value just before will be returned instead. // that's fine if it don't, the value just before will be returned instead.
let biggest_level = db let biggest_level = numbers_db
.remap_data_type::<DecodeIgnore>() .remap_data_type::<DecodeIgnore>()
.get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))?
.and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None });
@ -520,52 +435,25 @@ impl FacetCondition {
match biggest_level { match biggest_level {
Some(level) => { Some(level) => {
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, db, field_id, level, left, right, &mut output)?; Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?;
Ok(output) Ok(output)
}, },
None => Ok(RoaringBitmap::new()), None => Ok(RoaringBitmap::new()),
} }
} }
fn evaluate_string_operator(
rtxn: &heed::RoTxn,
index: &Index,
db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId,
operator: &FacetStringOperator,
) -> anyhow::Result<RoaringBitmap>
{
match operator {
FacetStringOperator::Equal(string) => {
match db.get(rtxn, &(field_id, string))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new())
}
},
FacetStringOperator::NotEqual(string) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
let op = FacetStringOperator::Equal(string.clone());
let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?;
Ok(all_documents_ids - docids)
},
}
}
pub fn evaluate( pub fn evaluate(
&self, &self,
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
let db = index.facet_field_id_value_docids; let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
match self { match self {
OperatorString(fid, op) => { Operator(fid, op) => {
let db = db.remap_key_type::<FacetValueStringCodec>(); Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op)
Self::evaluate_string_operator(rtxn, index, db, *fid, op)
},
OperatorNumber(fid, op) => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
Self::evaluate_number_operator(rtxn, index, db, *fid, *op)
}, },
Or(lhs, rhs) => { Or(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?; let lhs = lhs.evaluate(rtxn, index)?;
@ -586,7 +474,8 @@ mod tests {
use super::*; use super::*;
use crate::update::Settings; use crate::update::Settings;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::hashmap; use maplit::hashset;
use big_s::S;
#[test] #[test]
fn string() { fn string() {
@ -598,22 +487,22 @@ mod tests {
// Set the faceted fields to be the channel. // Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); builder.set_faceted_fields(hashset!{ S("channel") });
builder.execute(|_, _| ()).unwrap(); builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Test that the facet condition is correctly generated. // Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::equal("Ponce")); let expected = Operator(0, Operator::Equal(None, S("ponce")));
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }
@ -627,20 +516,20 @@ mod tests {
// Set the faceted fields to be the channel. // Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() }); builder.set_faceted_fields(hashset!{ "timestamp".into() });
builder.execute(|_, _| ()).unwrap(); builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Test that the facet condition is correctly generated. // Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
let expected = OperatorNumber(0, Between(22.0, 44.0)); let expected = Operator(0, Between(22.0, 44.0));
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or( let expected = Or(
Box::new(OperatorNumber(0, LowerThan(22.0))), Box::new(Operator(0, LowerThan(22.0))),
Box::new(OperatorNumber(0, GreaterThan(44.0))), Box::new(Operator(0, GreaterThan(44.0))),
); );
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }
@ -655,11 +544,8 @@ mod tests {
// Set the faceted fields to be the channel. // Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_faceted_fields(hashmap!{ builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") });
"channel".into() => "string".into(),
"timestamp".into() => "number".into(),
});
builder.execute(|_, _| ()).unwrap(); builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
@ -670,10 +556,10 @@ mod tests {
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
).unwrap(); ).unwrap();
let expected = Or( let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(And( Box::new(And(
Box::new(OperatorNumber(1, Between(22.0, 44.0))), Box::new(Operator(1, Between(22.0, 44.0))),
Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))),
)) ))
); );
assert_eq!(condition, expected); assert_eq!(condition, expected);
@ -683,13 +569,13 @@ mod tests {
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
).unwrap(); ).unwrap();
let expected = Or( let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(Or( Box::new(Or(
Box::new(Or( Box::new(Or(
Box::new(OperatorNumber(1, LowerThan(22.0))), Box::new(Operator(1, LowerThan(22.0))),
Box::new(OperatorNumber(1, GreaterThan(44.0))), Box::new(Operator(1, GreaterThan(44.0))),
)), )),
Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), Box::new(Operator(0, Operator::Equal(None, S("ponce")))),
)), )),
); );
assert_eq!(condition, expected); assert_eq!(condition, expected);
@ -705,11 +591,8 @@ mod tests {
// Set the faceted fields to be the channel. // Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_faceted_fields(hashmap!{ builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") });
"channel".into() => "string".into(),
"timestamp".into() => "number".into(),
});
builder.execute(|_, _| ()).unwrap(); builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
@ -717,7 +600,7 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_array( let condition = FacetCondition::from_array(
&rtxn, &index, &rtxn, &index,
vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])], vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])],
).unwrap().unwrap(); ).unwrap().unwrap();
let expected = FacetCondition::from_str( let expected = FacetCondition::from_str(
&rtxn, &index, &rtxn, &index,

View File

@ -3,12 +3,12 @@ use std::ops::Bound::Unbounded;
use std::{cmp, fmt}; use std::{cmp, fmt};
use anyhow::Context; use anyhow::Context;
use heed::BytesDecode; use heed::{Database, BytesDecode};
use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::facet::{FacetType, FacetValue}; use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::FacetValueStringCodec;
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::search::facet::{FacetIter, FacetRange}; use crate::search::facet::{FacetIter, FacetRange};
use crate::{Index, FieldId, DocumentId}; use crate::{Index, FieldId, DocumentId};
@ -60,86 +60,81 @@ impl<'a> FacetDistribution<'a> {
/// There is a small amount of candidates OR we ask for facet string values so we /// There is a small amount of candidates OR we ask for facet string values so we
/// decide to iterate over the facet values of each one of them, one by one. /// decide to iterate over the facet values of each one of them, one by one.
fn facet_values_from_documents( fn facet_distribution_from_documents(
&self, &self,
field_id: FieldId, field_id: FieldId,
facet_type: FacetType, facet_type: FacetType,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>> distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{ {
fn fetch_facet_values<'t, KC, K: 't>( fn fetch_facet_values<'t, KC, K: 't>(
index: &Index,
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
db: Database<KC, Unit>,
field_id: FieldId, field_id: FieldId,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>> distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
where where
K: fmt::Display,
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
K: Into<FacetValue>,
{ {
let mut facet_values = BTreeMap::new();
let mut key_buffer = vec![field_id]; let mut key_buffer = vec![field_id];
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
key_buffer.truncate(1); key_buffer.truncate(1);
key_buffer.extend_from_slice(&docid.to_be_bytes()); key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = index.field_id_docid_facet_values let iter = db
.remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &key_buffer)? .prefix_iter(rtxn, &key_buffer)?
.remap_key_type::<KC>(); .remap_key_type::<KC>();
for result in iter { for result in iter {
let ((_, _, value), ()) = result?; let ((_, _, value), ()) = result?;
*facet_values.entry(value.into()).or_insert(0) += 1; *distribution.entry(value.to_string()).or_insert(0) += 1;
} }
} }
Ok(facet_values) Ok(())
} }
let index = self.index;
let rtxn = self.rtxn;
match facet_type { match facet_type {
FacetType::String => {
fetch_facet_values::<FieldDocIdFacetStringCodec, _>(index, rtxn, field_id, candidates)
},
FacetType::Number => { FacetType::Number => {
fetch_facet_values::<FieldDocIdFacetF64Codec, _>(index, rtxn, field_id, candidates) let db = self.index.field_id_docid_facet_f64s;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
}, },
FacetType::String => {
let db = self.index.field_id_docid_facet_strings;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
}
} }
} }
/// There is too much documents, we use the facet levels to move throught /// There is too much documents, we use the facet levels to move throught
/// the facet values, to find the candidates and values associated. /// the facet values, to find the candidates and values associated.
fn facet_values_from_facet_levels( fn facet_numbers_distribution_from_facet_levels(
&self, &self,
field_id: FieldId, field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>> distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{ {
let iter = match facet_type { let iter = FacetIter::new_non_reducing(
FacetType::String => unreachable!(), self.rtxn, self.index, field_id, candidates.clone(),
FacetType::Number => { )?;
let iter = FacetIter::new_non_reducing(
self.rtxn, self.index, field_id, candidates.clone(),
)?;
iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))
},
};
let mut facet_values = BTreeMap::new();
for result in iter { for result in iter {
let (value, mut docids) = result?; let (value, mut docids) = result?;
docids.intersect_with(candidates); docids.intersect_with(candidates);
if !docids.is_empty() { if !docids.is_empty() {
facet_values.insert(value, docids.len()); distribution.insert(value.to_string(), docids.len());
} }
if facet_values.len() == self.max_values_by_facet { if distribution.len() == self.max_values_by_facet {
break; break;
} }
} }
Ok(facet_values) Ok(())
} }
/// Placeholder search, a.k.a. no candidates were specified. We iterate throught the /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the
@ -147,80 +142,73 @@ impl<'a> FacetDistribution<'a> {
fn facet_values_from_raw_facet_database( fn facet_values_from_raw_facet_database(
&self, &self,
field_id: FieldId, field_id: FieldId,
facet_type: FacetType, ) -> heed::Result<BTreeMap<String, u64>>
) -> heed::Result<BTreeMap<FacetValue, u64>>
{ {
let db = self.index.facet_field_id_value_docids; let mut distribution = BTreeMap::new();
let level = 0;
let iter = match facet_type {
FacetType::String => {
let iter = db
.prefix_iter(self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>()
.map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids)));
Box::new(iter) as Box::<dyn Iterator<Item=_>>
},
FacetType::Number => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
let range = FacetRange::new(
self.rtxn, db, field_id, level, Unbounded, Unbounded,
)?;
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
},
};
let mut facet_values = BTreeMap::new(); let db = self.index.facet_id_f64_docids;
for result in iter { let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?;
let (value, docids) = result?;
facet_values.insert(value, docids.len()); for result in range {
if facet_values.len() == self.max_values_by_facet { let ((_, _, value, _), docids) = result?;
distribution.insert(value.to_string(), docids.len());
if distribution.len() == self.max_values_by_facet {
break; break;
} }
} }
Ok(facet_values) let iter = self.index
.facet_id_string_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>();
for result in iter {
let ((_, value), docids) = result?;
distribution.insert(value.to_string(), docids.len());
if distribution.len() == self.max_values_by_facet {
break;
}
}
Ok(distribution)
} }
fn facet_values( fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
&self, use FacetType::{Number, String};
field_id: FieldId,
facet_type: FacetType,
) -> heed::Result<BTreeMap<FacetValue, u64>>
{
if let Some(candidates) = self.candidates.as_ref() { if let Some(candidates) = self.candidates.as_ref() {
// Classic search, candidates were specified, we must return facet values only related // Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons. // to those candidates. We also enter here for facet strings for performance reasons.
if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String { let mut distribution = BTreeMap::new();
self.facet_values_from_documents(field_id, facet_type, candidates) if candidates.len() <= CANDIDATES_THRESHOLD {
self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
} else { } else {
self.facet_values_from_facet_levels(field_id, facet_type, candidates) self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
} }
Ok(distribution)
} else { } else {
self.facet_values_from_raw_facet_database(field_id, facet_type) self.facet_values_from_raw_facet_database(field_id)
} }
} }
pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<FacetValue, u64>>> { pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<String, u64>>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let faceted_fields = self.index.faceted_fields(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?;
let fields_ids: Vec<_> = match &self.facets {
Some(names) => names
.iter()
.filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t)))
.collect(),
None => faceted_fields.into_iter().collect(),
};
let mut facets_values = BTreeMap::new(); let mut distribution = BTreeMap::new();
for (name, ftype) in fields_ids { for name in faceted_fields {
let fid = fields_ids_map.id(&name).with_context(|| { let fid = fields_ids_map.id(&name).with_context(|| {
format!("missing field name {:?} from the fields id map", name) format!("missing field name {:?} from the fields id map", name)
})?; })?;
let values = self.facet_values(fid, ftype)?; let values = self.facet_values(fid)?;
facets_values.insert(name, values); distribution.insert(name, values);
} }
Ok(facets_values) Ok(distribution)
} }
} }

View File

@ -9,7 +9,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{Index, FieldId}; use crate::{Index, FieldId};
pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; pub use self::facet_condition::{FacetCondition, Operator};
pub use self::facet_distribution::FacetDistribution; pub use self::facet_distribution::FacetDistribution;
mod facet_condition; mod facet_condition;
@ -140,7 +140,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>> ) -> heed::Result<FacetIter<'t>>
{ {
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(); let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))]; let level_iters = vec![(documents_ids, Left(highest_iter))];
@ -157,7 +157,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>> ) -> heed::Result<FacetIter<'t>>
{ {
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(); let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Right(highest_iter))]; let level_iters = vec![(documents_ids, Right(highest_iter))];
@ -175,7 +175,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap, documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>> ) -> heed::Result<FacetIter<'t>>
{ {
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(); let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))]; let level_iters = vec![(documents_ids, Left(highest_iter))];

View File

@ -16,9 +16,7 @@ use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct};
use crate::search::criteria::r#final::{Final, FinalResult}; use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{Index, DocumentId}; use crate::{Index, DocumentId};
pub use self::facet::{ pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator};
FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator,
};
pub use self::query_tree::MatchingWords; pub use self::query_tree::MatchingWords;
use self::query_tree::QueryTreeBuilder; use self::query_tree::QueryTreeBuilder;
@ -143,15 +141,12 @@ impl<'a> Search<'a> {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
let id = field_ids_map.id(name).expect("distinct not present in field map"); let id = field_ids_map.id(name).expect("distinct not present in field map");
let faceted_fields = self.index.faceted_fields(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?;
match faceted_fields.get(name) { if faceted_fields.contains(name) {
Some(facet_type) => { let distinct = FacetDistinct::new(id, self.index, self.rtxn);
let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); self.perform_sort(distinct, matching_words, criteria)
self.perform_sort(distinct, matching_words, criteria) } else {
} let distinct = MapDistinct::new(id, self.index, self.rtxn);
None => { self.perform_sort(distinct, matching_words, criteria)
let distinct = MapDistinct::new(id, self.index, self.rtxn);
self.perform_sort(distinct, matching_words, criteria)
}
} }
} }
} }

View File

@ -30,8 +30,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
word_level_position_docids, word_level_position_docids,
word_prefix_level_position_docids, word_prefix_level_position_docids,
facet_field_id_value_docids, facet_id_f64_docids,
field_id_docid_facet_values, facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents, documents,
} = self.index; } = self.index;
@ -47,8 +49,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
// We clean all the faceted documents ids. // We clean all the faceted documents ids.
for (field_id, _) in faceted_fields { let empty = RoaringBitmap::default();
self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; for field_id in faceted_fields {
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?;
} }
// Clear the other databases. // Clear the other databases.
@ -59,8 +63,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?;
word_level_position_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?;
word_prefix_level_position_docids.clear(self.wtxn)?; word_prefix_level_position_docids.clear(self.wtxn)?;
facet_field_id_value_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?;
field_id_docid_facet_values.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
documents.clear(self.wtxn)?; documents.clear(self.wtxn)?;
Ok(number_of_documents) Ok(number_of_documents)
@ -112,8 +118,10 @@ mod tests {
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap()); assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap());
assert!(index.documents.is_empty(&rtxn).unwrap()); assert!(index.documents.is_empty(&rtxn).unwrap());
} }
} }

View File

@ -4,13 +4,12 @@ use std::collections::hash_map::Entry;
use anyhow::anyhow; use anyhow::anyhow;
use chrono::Utc; use chrono::Utc;
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::ByteSlice; use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use crate::facet::FacetType; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use super::ClearDocuments; use super::ClearDocuments;
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {
@ -90,8 +89,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
word_level_position_docids, word_level_position_docids,
word_prefix_level_position_docids, word_prefix_level_position_docids,
facet_field_id_value_docids, facet_id_f64_docids,
field_id_docid_facet_values, facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents, documents,
} = self.index; } = self.index;
@ -285,52 +286,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
// Remove the documents ids from the faceted documents ids.
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
for (field_id, facet_type) in faceted_fields {
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
// We delete the entries that are part of the documents ids.
let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?;
match facet_type {
FacetType::String => {
let mut iter = iter.remap_key_type::<FieldDocIdFacetStringCodec>();
while let Some(result) = iter.next() {
let ((_fid, docid, _value), ()) = result?;
if self.documents_ids.contains(docid) {
iter.del_current()?;
}
}
},
FacetType::Number => {
let mut iter = iter.remap_key_type::<FieldDocIdFacetF64Codec>();
while let Some(result) = iter.next() {
let ((_fid, docid, _value), ()) = result?;
if self.documents_ids.contains(docid) {
iter.del_current()?;
}
}
},
}
}
// We delete the documents ids that are under the facet field id values.
let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
drop(iter);
// We delete the documents ids that are under the word level position docids. // We delete the documents ids that are under the word level position docids.
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
@ -361,10 +316,100 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_field_id_value_docids(
self.wtxn,
facet_id_f64_docids,
&self.documents_ids,
)?;
remove_docids_from_facet_field_id_value_docids(
self.wtxn,
facet_id_string_docids,
&self.documents_ids,
)?;
// Remove the documents ids from the faceted documents ids.
for field_id in self.index.faceted_fields_ids(self.wtxn)? {
// Remove docids from the number faceted documents ids
let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value(
self.wtxn,
field_id_docid_facet_f64s,
field_id,
&self.documents_ids,
|(_fid, docid, _value)| docid,
)?;
// Remove docids from the string faceted documents ids
let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value(
self.wtxn,
field_id_docid_facet_strings,
field_id,
&self.documents_ids,
|(_fid, docid, _value)| docid,
)?;
}
Ok(self.documents_ids.len()) Ok(self.documents_ids.len())
} }
} }
/// Deletes from `db` every entry stored under the `field_id` prefix whose key
/// maps to a document id contained in `to_remove`.
///
/// * `wtxn` - write transaction used both to iterate and to delete in place.
/// * `db` - database whose keys are decoded with `C` and whose values are `Unit`.
/// * `field_id` - used as the key prefix that restricts the scan.
/// * `to_remove` - ids of the documents whose entries must be dropped.
/// * `convert` - extracts the document id out of a decoded key.
///
/// Any `heed` error raised while opening the iterator, decoding an entry or
/// deleting it is returned immediately.
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>(
    wtxn: &'a mut heed::RwTxn,
    db: &heed::Database<C, Unit>,
    field_id: FieldId,
    to_remove: &RoaringBitmap,
    convert: F,
) -> heed::Result<()>
where
    C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>,
    F: Fn(K) -> DocumentId,
{
    // The prefix is supplied as raw bytes, so the key type is temporarily
    // remapped to `ByteSlice` and then back to `C` to decode the keys.
    let raw_db = db.remap_key_type::<ByteSlice>();
    let mut entries = raw_db.prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();

    while let Some(entry) = entries.next() {
        let (key, ()) = entry?;
        let docid = convert(key);
        if to_remove.contains(docid) {
            entries.del_current()?;
        }
    }

    Ok(())
}
/// Subtracts `to_remove` from the documents-ids bitmap of every entry of `db`.
///
/// For each entry, once the ids have been subtracted:
/// - an empty bitmap (including one that was already empty) deletes the entry,
/// - a bitmap that shrank is written back under the same key,
/// - an unchanged bitmap leaves the entry untouched.
///
/// * `wtxn` - write transaction used to iterate and to update in place.
/// * `db` - database whose values are `CboRoaringBitmapCodec` bitmaps; keys are
///   handled as raw bytes and never decoded.
/// * `to_remove` - ids of the documents to strip from every bitmap.
///
/// Any `heed` error raised while iterating, deleting or rewriting is returned.
fn remove_docids_from_facet_field_id_value_docids<'a, C>(
    wtxn: &'a mut heed::RwTxn,
    db: &heed::Database<C, CboRoaringBitmapCodec>,
    to_remove: &RoaringBitmap,
) -> heed::Result<()>
where
    C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
{
    let mut entries = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;

    while let Some(entry) = entries.next() {
        let (raw_key, mut docids) = entry?;

        let len_before = docids.len();
        docids.difference_with(to_remove);
        let len_after = docids.len();

        if len_after == 0 {
            entries.del_current()?;
        } else if len_after != len_before {
            // Only rewrite the entry when ids were actually removed from it.
            entries.put_current(raw_key, &docids)?;
        }
    }

    Ok(())
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use heed::EnvOpenOptions; use heed::EnvOpenOptions;

View File

@ -9,7 +9,6 @@ use heed::{BytesEncode, Error};
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::Index; use crate::Index;
@ -62,56 +61,50 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
debug!("Computing and writing the facet values levels docids into LMDB on disk..."); debug!("Computing and writing the facet values levels docids into LMDB on disk...");
for (field_id, facet_type) in faceted_fields {
let (content, documents_ids) = match facet_type {
FacetType::String => {
let documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_field_id_value_docids,
field_id,
)?;
(None, documents_ids) for field_id in faceted_fields {
}, // Compute and store the faceted strings documents ids.
FacetType::Number => { let string_documents_ids = compute_faceted_documents_ids(
clear_field_number_levels( self.wtxn,
self.wtxn, self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(),
self.index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(), field_id,
field_id, )?;
)?;
let documents_ids = compute_faceted_documents_ids( // Clear the facet number levels.
self.wtxn, clear_field_number_levels(
self.index.facet_field_id_value_docids, self.wtxn,
field_id, self.index.facet_id_f64_docids,
)?; field_id,
)?;
let content = compute_facet_number_levels( // Compute and store the faceted numbers documents ids.
self.wtxn, let number_documents_ids = compute_faceted_documents_ids(
self.index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(), self.wtxn,
self.chunk_compression_type, self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
self.chunk_compression_level, field_id,
self.chunk_fusing_shrink_size, )?;
self.level_group_size,
self.min_level_size,
field_id,
)?;
(Some(content), documents_ids) let content = compute_facet_number_levels(
}, self.wtxn,
}; self.index.facet_id_f64_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.level_group_size,
self.min_level_size,
field_id,
)?;
if let Some(content) = content { self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?;
write_into_lmdb_database( self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?;
self.wtxn,
*self.index.facet_field_id_value_docids.as_polymorph(),
content,
|_, _| anyhow::bail!("invalid facet level merging"),
WriteMethod::GetMergePut,
)?;
}
self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; write_into_lmdb_database(
self.wtxn,
*self.index.facet_id_f64_docids.as_polymorph(),
content,
|_, _| anyhow::bail!("invalid facet number level merging"),
WriteMethod::GetMergePut,
)?;
} }
Ok(()) Ok(())
@ -205,10 +198,12 @@ fn compute_faceted_documents_ids(
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
let mut documents_ids = RoaringBitmap::new(); let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? { for result in db.prefix_iter(rtxn, &[field_id])? {
let (_key, docids) = result?; let (_key, docids) = result?;
documents_ids.union_with(&docids); documents_ids |= docids;
} }
Ok(documents_ids) Ok(documents_ids)
} }

View File

@ -412,7 +412,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
Main, Main,
WordDocids, WordDocids,
WordLevel0PositionDocids, WordLevel0PositionDocids,
FacetLevel0ValuesDocids, FacetLevel0NumbersDocids,
} }
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
@ -450,8 +450,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
.enumerate() .enumerate()
.map(|(i, documents)| { .map(|(i, documents)| {
let store = Store::new( let store = Store::new(
primary_key.clone(),
fields_ids_map.clone(),
searchable_fields.clone(), searchable_fields.clone(),
faceted_fields.clone(), faceted_fields.clone(),
linked_hash_map_size, linked_hash_map_size,
@ -478,8 +476,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len());
let mut documents_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len());
readers.into_iter().for_each(|readers| { readers.into_iter().for_each(|readers| {
let Readers { let Readers {
@ -488,17 +488,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions, docid_word_positions,
words_pairs_proximities_docids, words_pairs_proximities_docids,
word_level_position_docids, word_level_position_docids,
facet_field_value_docids, facet_field_numbers_docids,
field_id_docid_facet_values, facet_field_strings_docids,
documents field_id_docid_facet_numbers,
field_id_docid_facet_strings,
documents,
} = readers; } = readers;
main_readers.push(main); main_readers.push(main);
word_docids_readers.push(word_docids); word_docids_readers.push(word_docids);
docid_word_positions_readers.push(docid_word_positions); docid_word_positions_readers.push(docid_word_positions);
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
word_level_position_docids_readers.push(word_level_position_docids); word_level_position_docids_readers.push(word_level_position_docids);
facet_field_value_docids_readers.push(facet_field_value_docids); facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
field_id_docid_facet_values_readers.push(field_id_docid_facet_values); facet_field_strings_docids_readers.push(facet_field_strings_docids);
field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings);
documents_readers.push(documents); documents_readers.push(documents);
}); });
@ -523,8 +527,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
(DatabaseType::Main, main_readers, main_merge as MergeFn), (DatabaseType::Main, main_readers, main_merge as MergeFn),
(DatabaseType::WordDocids, word_docids_readers, word_docids_merge), (DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
( (
DatabaseType::FacetLevel0ValuesDocids, DatabaseType::FacetLevel0NumbersDocids,
facet_field_value_docids_readers, facet_field_numbers_docids_readers,
facet_field_value_docids_merge, facet_field_value_docids_merge,
), ),
( (
@ -547,7 +551,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions_readers, docid_word_positions_readers,
documents_readers, documents_readers,
words_pairs_proximities_docids_readers, words_pairs_proximities_docids_readers,
field_id_docid_facet_values_readers, facet_field_strings_docids_readers,
field_id_docid_facet_numbers_readers,
field_id_docid_facet_strings_readers,
)) as anyhow::Result<_> )) as anyhow::Result<_>
})?; })?;
@ -556,7 +562,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions_readers, docid_word_positions_readers,
documents_readers, documents_readers,
words_pairs_proximities_docids_readers, words_pairs_proximities_docids_readers,
field_id_docid_facet_values_readers, facet_field_strings_docids_readers,
field_id_docid_facet_numbers_readers,
field_id_docid_facet_strings_readers,
) = readers; ) = readers;
let mut documents_ids = self.index.documents_ids(self.wtxn)?; let mut documents_ids = self.index.documents_ids(self.wtxn)?;
@ -587,7 +595,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.index.put_documents_ids(self.wtxn, &documents_ids)?; self.index.put_documents_ids(self.wtxn, &documents_ids)?;
let mut database_count = 0; let mut database_count = 0;
let total_databases = 8; let total_databases = 10;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: 0, databases_seen: 0,
@ -624,11 +632,41 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
total_databases, total_databases,
}); });
debug!("Writing the field id docid facet values into LMDB on disk..."); debug!("Writing the facet id string docids into LMDB on disk...");
merge_into_lmdb_database( merge_into_lmdb_database(
self.wtxn, self.wtxn,
*self.index.field_id_docid_facet_values.as_polymorph(), *self.index.facet_id_string_docids.as_polymorph(),
field_id_docid_facet_values_readers, facet_field_strings_docids_readers,
facet_field_value_docids_merge,
write_method,
)?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases,
});
debug!("Writing the field id docid facet numbers into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
*self.index.field_id_docid_facet_f64s.as_polymorph(),
field_id_docid_facet_numbers_readers,
field_id_docid_facet_values_merge,
write_method,
)?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases,
});
debug!("Writing the field id docid facet strings into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
*self.index.field_id_docid_facet_strings.as_polymorph(),
field_id_docid_facet_strings_readers,
field_id_docid_facet_values_merge, field_id_docid_facet_values_merge,
write_method, write_method,
)?; )?;
@ -678,9 +716,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method, write_method,
)?; )?;
}, },
DatabaseType::FacetLevel0ValuesDocids => { DatabaseType::FacetLevel0NumbersDocids => {
debug!("Writing the facet level 0 values docids into LMDB on disk..."); debug!("Writing the facet numbers docids into LMDB on disk...");
let db = *self.index.facet_field_id_value_docids.as_polymorph(); let db = *self.index.facet_id_f64_docids.as_polymorph();
write_into_lmdb_database( write_into_lmdb_database(
self.wtxn, self.wtxn,
db, db,

View File

@ -6,25 +6,24 @@ use std::iter::FromIterator;
use std::time::Instant; use std::time::Instant;
use std::{cmp, iter}; use std::{cmp, iter};
use anyhow::{bail, Context}; use anyhow::Context;
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use fst::Set; use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use heed::BytesEncode; use heed::BytesEncode;
use linked_hash_map::LinkedHashMap; use linked_hash_map::LinkedHashMap;
use log::{debug, info, warn}; use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use tempfile::tempfile; use tempfile::tempfile;
use crate::facet::{FacetType, FacetValue};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep; use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{ use super::merge_function::{
@ -45,23 +44,24 @@ pub struct Readers {
pub docid_word_positions: Reader<FileFuse>, pub docid_word_positions: Reader<FileFuse>,
pub words_pairs_proximities_docids: Reader<FileFuse>, pub words_pairs_proximities_docids: Reader<FileFuse>,
pub word_level_position_docids: Reader<FileFuse>, pub word_level_position_docids: Reader<FileFuse>,
pub facet_field_value_docids: Reader<FileFuse>, pub facet_field_numbers_docids: Reader<FileFuse>,
pub field_id_docid_facet_values: Reader<FileFuse>, pub facet_field_strings_docids: Reader<FileFuse>,
pub field_id_docid_facet_numbers: Reader<FileFuse>,
pub field_id_docid_facet_strings: Reader<FileFuse>,
pub documents: Reader<FileFuse>, pub documents: Reader<FileFuse>,
} }
pub struct Store<'s, A> { pub struct Store<'s, A> {
// Indexing parameters // Indexing parameters
primary_key: String,
fields_ids_map: FieldsIdsMap,
searchable_fields: HashSet<FieldId>, searchable_fields: HashSet<FieldId>,
faceted_fields: HashMap<FieldId, FacetType>, faceted_fields: HashSet<FieldId>,
// Caches // Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize, word_docids_limit: usize,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize, words_pairs_proximities_docids_limit: usize,
facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
facet_field_value_docids_limit: usize, facet_field_value_docids_limit: usize,
// MTBL parameters // MTBL parameters
chunk_compression_type: CompressionType, chunk_compression_type: CompressionType,
@ -72,8 +72,10 @@ pub struct Store<'s, A> {
word_docids_sorter: Sorter<MergeFn>, word_docids_sorter: Sorter<MergeFn>,
words_pairs_proximities_docids_sorter: Sorter<MergeFn>, words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
word_level_position_docids_sorter: Sorter<MergeFn>, word_level_position_docids_sorter: Sorter<MergeFn>,
facet_field_value_docids_sorter: Sorter<MergeFn>, facet_field_numbers_docids_sorter: Sorter<MergeFn>,
field_id_docid_facet_values_sorter: Sorter<MergeFn>, facet_field_strings_docids_sorter: Sorter<MergeFn>,
field_id_docid_facet_numbers_sorter: Sorter<MergeFn>,
field_id_docid_facet_strings_sorter: Sorter<MergeFn>,
// MTBL writers // MTBL writers
docid_word_positions_writer: Writer<File>, docid_word_positions_writer: Writer<File>,
documents_writer: Writer<File>, documents_writer: Writer<File>,
@ -83,10 +85,8 @@ pub struct Store<'s, A> {
impl<'s, A: AsRef<[u8]>> Store<'s, A> { impl<'s, A: AsRef<[u8]>> Store<'s, A> {
pub fn new( pub fn new(
primary_key: String,
fields_ids_map: FieldsIdsMap,
searchable_fields: HashSet<FieldId>, searchable_fields: HashSet<FieldId>,
faceted_fields: HashMap<FieldId, FacetType>, faceted_fields: HashSet<FieldId>,
linked_hash_map_size: Option<usize>, linked_hash_map_size: Option<usize>,
max_nb_chunks: Option<usize>, max_nb_chunks: Option<usize>,
max_memory: Option<usize>, max_memory: Option<usize>,
@ -132,7 +132,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_nb_chunks, max_nb_chunks,
max_memory, max_memory,
); );
let facet_field_value_docids_sorter = create_sorter( let facet_field_numbers_docids_sorter = create_sorter(
facet_field_value_docids_merge, facet_field_value_docids_merge,
chunk_compression_type, chunk_compression_type,
chunk_compression_level, chunk_compression_level,
@ -140,7 +140,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_nb_chunks, max_nb_chunks,
max_memory, max_memory,
); );
let field_id_docid_facet_values_sorter = create_sorter( let facet_field_strings_docids_sorter = create_sorter(
facet_field_value_docids_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
max_nb_chunks,
max_memory,
);
let field_id_docid_facet_numbers_sorter = create_sorter(
field_id_docid_facet_values_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
max_nb_chunks,
Some(1024 * 1024 * 1024), // 1 GiB (1024^3 bytes) -- NOTE(review): comment previously said 1MB; confirm the intended cap
);
let field_id_docid_facet_strings_sorter = create_sorter(
field_id_docid_facet_values_merge, field_id_docid_facet_values_merge,
chunk_compression_type, chunk_compression_type,
chunk_compression_level, chunk_compression_level,
@ -164,8 +180,6 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(Store { Ok(Store {
// Indexing parameters. // Indexing parameters.
primary_key,
fields_ids_map,
searchable_fields, searchable_fields,
faceted_fields, faceted_fields,
// Caches // Caches
@ -173,7 +187,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids_limit: linked_hash_map_size, word_docids_limit: linked_hash_map_size,
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
words_pairs_proximities_docids_limit: linked_hash_map_size, words_pairs_proximities_docids_limit: linked_hash_map_size,
facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size), facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
facet_field_value_docids_limit: linked_hash_map_size, facet_field_value_docids_limit: linked_hash_map_size,
// MTBL parameters // MTBL parameters
chunk_compression_type, chunk_compression_type,
@ -184,8 +199,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids_sorter, word_docids_sorter,
words_pairs_proximities_docids_sorter, words_pairs_proximities_docids_sorter,
word_level_position_docids_sorter, word_level_position_docids_sorter,
facet_field_value_docids_sorter, facet_field_numbers_docids_sorter,
field_id_docid_facet_values_sorter, facet_field_strings_docids_sorter,
field_id_docid_facet_numbers_sorter,
field_id_docid_facet_strings_sorter,
// MTBL writers // MTBL writers
docid_word_positions_writer, docid_word_positions_writer,
documents_writer, documents_writer,
@ -215,34 +232,68 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(()) Ok(())
} }
// Save the documents ids under the facet field id and value we have seen it. fn insert_facet_number_values_docid(
fn insert_facet_values_docid(
&mut self, &mut self,
field_id: FieldId, field_id: FieldId,
field_value: FacetValue, value: OrderedFloat<f64>,
id: DocumentId, id: DocumentId,
) -> anyhow::Result<()> ) -> anyhow::Result<()>
{ {
Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?; let sorter = &mut self.field_id_docid_facet_numbers_sorter;
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
let key = (field_id, field_value); let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map. // if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_value_docids.get_refresh(&key) { match self.facet_field_number_docids.get_refresh(&key) {
Some(old) => { old.insert(id); }, Some(old) => { old.insert(id); },
None => { None => {
// A newly inserted element is appended at the end of the linked hash map. // A newly inserted element is appended at the end of the linked hash map.
self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id))); self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
// If the cache just reached its capacity we must make sure to remove // If the cache just reached its capacity we must make sure to remove
// one element, this way next time we insert we don't grow the capacity. // one element, this way next time we insert we don't grow the capacity.
if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit {
// Removing the front element is equivalent to removing the LRU element. // Removing the front element is equivalent to removing the LRU element.
Self::write_facet_field_value_docids( Self::write_facet_field_number_docids(
&mut self.facet_field_value_docids_sorter, &mut self.facet_field_numbers_docids_sorter,
self.facet_field_value_docids.pop_front(), self.facet_field_number_docids.pop_front(),
)?; )?;
} }
} }
} }
Ok(())
}
// Save the document id under the facet field id and string value it was seen with.
fn insert_facet_string_values_docid(
&mut self,
field_id: FieldId,
value: String,
id: DocumentId,
) -> anyhow::Result<()>
{
let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
None => {
// A newly inserted element is appended at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
// If the facet strings cache just reached its capacity we must make sure to remove
// one element, this way next time we insert we don't grow the capacity.
if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit {
// Removing the front element is equivalent to removing the LRU element.
Self::write_facet_field_string_docids(
&mut self.facet_field_strings_docids_sorter,
self.facet_field_string_docids.pop_front(),
)?;
}
}
}
Ok(()) Ok(())
} }
@ -287,7 +338,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
&mut self, &mut self,
document_id: DocumentId, document_id: DocumentId,
words_positions: &mut HashMap<String, SmallVec32<Position>>, words_positions: &mut HashMap<String, SmallVec32<Position>>,
facet_values: &mut HashMap<FieldId, SmallVec8<FacetValue>>, facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
record: &[u8], record: &[u8],
) -> anyhow::Result<()> ) -> anyhow::Result<()>
{ {
@ -306,10 +358,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
words_positions.clear(); words_positions.clear();
// We store document_id associated with all the field id and values. // We store document_id associated with all the facet numbers fields ids and values.
for (field, values) in facet_values.drain() { for (field, values) in facet_numbers_values.drain() {
for value in values { for value in values {
self.insert_facet_values_docid(field, value, document_id)?; let value = OrderedFloat::from(value);
self.insert_facet_number_values_docid(field, value, document_id)?;
}
}
// We store document_id associated with all the facet strings fields ids and values.
for (field, values) in facet_strings_values.drain() {
for value in values {
self.insert_facet_string_values_docid(field, value, document_id)?;
} }
} }
@ -409,20 +469,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(()) Ok(())
} }
fn write_facet_field_value_docids<I>( fn write_facet_field_string_docids<I>(
sorter: &mut Sorter<MergeFn>, sorter: &mut Sorter<MergeFn>,
iter: I, iter: I,
) -> anyhow::Result<()> ) -> anyhow::Result<()>
where I: IntoIterator<Item=((FieldId, FacetValue), RoaringBitmap)> where I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>
{ {
use FacetValue::*;
for ((field_id, value), docids) in iter { for ((field_id, value), docids) in iter {
let result = match value { let key = FacetValueStringCodec::bytes_encode(&(field_id, &value))
String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), .map(Cow::into_owned)
Number(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), .context("could not serialize facet key")?;
};
let key = result.context("could not serialize facet key")?;
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
.context("could not serialize docids")?; .context("could not serialize docids")?;
if lmdb_key_valid_size(&key) { if lmdb_key_valid_size(&key) {
@ -433,21 +489,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(()) Ok(())
} }
fn write_field_id_docid_facet_value( fn write_facet_field_number_docids<I>(
sorter: &mut Sorter<MergeFn>,
iter: I,
) -> anyhow::Result<()>
where I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>
{
for ((field_id, value), docids) in iter {
let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
.context("could not serialize docids")?;
if lmdb_key_valid_size(&key) {
sorter.insert(&key, &bytes)?;
}
}
Ok(())
}
fn write_field_id_docid_facet_number_value(
sorter: &mut Sorter<MergeFn>, sorter: &mut Sorter<MergeFn>,
field_id: FieldId, field_id: FieldId,
document_id: DocumentId, document_id: DocumentId,
value: &FacetValue, value: OrderedFloat<f64>,
) -> anyhow::Result<()> ) -> anyhow::Result<()>
{ {
use FacetValue::*; let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let result = match value { if lmdb_key_valid_size(&key) {
String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), sorter.insert(&key, &[])?;
Number(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), }
};
Ok(())
}
fn write_field_id_docid_facet_string_value(
sorter: &mut Sorter<MergeFn>,
field_id: FieldId,
document_id: DocumentId,
value: &str,
) -> anyhow::Result<()>
{
let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let key = result.context("could not serialize facet key")?;
if lmdb_key_valid_size(&key) { if lmdb_key_valid_size(&key) {
sorter.insert(&key, &[])?; sorter.insert(&key, &[])?;
} }
@ -493,7 +583,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut before = Instant::now(); let mut before = Instant::now();
let mut words_positions = HashMap::new(); let mut words_positions = HashMap::new();
let mut facet_values = HashMap::new(); let mut facet_numbers_values = HashMap::new();
let mut facet_strings_values = HashMap::new();
let mut count: usize = 0; let mut count: usize = 0;
while let Some((key, value)) = documents.next()? { while let Some((key, value)) = documents.next()? {
@ -513,32 +604,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
} }
for (attr, content) in document.iter() { for (attr, content) in document.iter() {
if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
let value = serde_json::from_slice(content)?; let value = serde_json::from_slice(content)?;
if let Some(ftype) = self.faceted_fields.get(&attr) { let (facet_numbers, facet_strings) = extract_facet_values(&value);
let mut values = match parse_facet_value(*ftype, &value) { facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
Ok(values) => values, facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
Err(e) => {
// We extract the name of the attribute and the document id
// to help users debug a facet type conversion.
let attr_name = self.fields_ids_map.name(attr).unwrap();
let document_id: Value = self.fields_ids_map.id(&self.primary_key)
.and_then(|fid| document.get(fid))
.map(serde_json::from_slice)
.unwrap()?;
let context = format!(
"while extracting facet from the {:?} attribute in the {} document",
attr_name, document_id,
);
warn!("{}", e.context(context));
SmallVec8::default()
},
};
facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..));
}
if self.searchable_fields.contains(&attr) { if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) { let content = match json_to_string(&value) {
@ -558,7 +629,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
} }
// We write the document in the documents store. // We write the document in the documents store.
self.write_document(document_id, &mut words_positions, &mut facet_values, value)?; self.write_document(
document_id,
&mut words_positions,
&mut facet_numbers_values,
&mut facet_strings_values,
value,
)?;
} }
// Compute the document id of the next document. // Compute the document id of the next document.
@ -585,9 +662,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
&mut self.words_pairs_proximities_docids_sorter, &mut self.words_pairs_proximities_docids_sorter,
self.words_pairs_proximities_docids, self.words_pairs_proximities_docids,
)?; )?;
Self::write_facet_field_value_docids( Self::write_facet_field_number_docids(
&mut self.facet_field_value_docids_sorter, &mut self.facet_field_numbers_docids_sorter,
self.facet_field_value_docids, self.facet_field_number_docids,
)?;
Self::write_facet_field_string_docids(
&mut self.facet_field_strings_docids_sorter,
self.facet_field_string_docids,
)?; )?;
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
@ -613,18 +695,26 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?; self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
let main = writer_into_reader(main_wtr, shrink_size)?; let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let documents = writer_into_reader(self.documents_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?;
@ -634,8 +724,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
docid_word_positions, docid_word_positions,
words_pairs_proximities_docids, words_pairs_proximities_docids,
word_level_position_docids, word_level_position_docids,
facet_field_value_docids, facet_field_numbers_docids,
field_id_docid_facet_values, facet_field_strings_docids,
field_id_docid_facet_numbers,
field_id_docid_facet_strings,
documents, documents,
}) })
} }
@ -710,71 +802,35 @@ fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<
.filter(|(_, t)| t.is_word()) .filter(|(_, t)| t.is_word())
} }
fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> { fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
use FacetValue::*; fn inner_extract_facet_values(
fn inner_parse_facet_value(
ftype: FacetType,
value: &Value, value: &Value,
can_recurse: bool, can_recurse: bool,
output: &mut SmallVec8<FacetValue>, output_numbers: &mut Vec<f64>,
) -> anyhow::Result<()> output_strings: &mut Vec<String>,
{ ) {
match value { match value {
Value::Null => Ok(()), Value::Null => (),
Value::Bool(b) => match ftype { Value::Bool(b) => output_strings.push(b.to_string()),
FacetType::String => { Value::Number(number) => if let Some(float) = number.as_f64() {
output.push(String(b.to_string())); output_numbers.push(float);
Ok(())
},
FacetType::Number => {
output.push(Number(OrderedFloat(if *b { 1.0 } else { 0.0 })));
Ok(())
},
},
Value::Number(number) => match ftype {
FacetType::String => {
output.push(String(number.to_string()));
Ok(())
},
FacetType::Number => match number.as_f64() {
Some(float) => {
output.push(Number(OrderedFloat(float)));
Ok(())
},
None => bail!("invalid facet type, expecting {} found number", ftype),
},
}, },
Value::String(string) => { Value::String(string) => {
// TODO must be normalized and not only lowercased.
let string = string.trim().to_lowercase(); let string = string.trim().to_lowercase();
match ftype { output_strings.push(string);
FacetType::String => {
output.push(String(string));
Ok(())
},
FacetType::Number => match string.parse() {
Ok(float) => {
output.push(Number(OrderedFloat(float)));
Ok(())
},
Err(_err) => bail!("invalid facet type, expecting {} found string", ftype),
},
}
}, },
Value::Array(values) => if can_recurse { Value::Array(values) => if can_recurse {
values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect() for value in values {
} else { inner_extract_facet_values(value, false, output_numbers, output_strings);
bail!( }
"invalid facet type, expecting {} found array (recursive arrays are not supported)",
ftype,
);
}, },
Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype), Value::Object(_) => (),
} }
} }
let mut facet_values = SmallVec8::new(); let mut facet_number_values = Vec::new();
inner_parse_facet_value(ftype, value, true, &mut facet_values)?; let mut facet_string_values = Vec::new();
Ok(facet_values) inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
(facet_number_values, facet_string_values)
} }

View File

@ -1,5 +1,4 @@
use std::collections::{BTreeSet, HashMap}; use std::collections::{BTreeSet, HashMap, HashSet};
use std::str::FromStr;
use anyhow::Context; use anyhow::Context;
use chrono::Utc; use chrono::Utc;
@ -11,7 +10,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::{FieldsIdsMap, Index}; use crate::{FieldsIdsMap, Index};
use crate::criterion::Criterion; use crate::criterion::Criterion;
use crate::facet::FacetType;
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::update::index_documents::{IndexDocumentsMethod, Transform}; use crate::update::index_documents::{IndexDocumentsMethod, Transform};
@ -68,7 +66,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
searchable_fields: Setting<Vec<String>>, searchable_fields: Setting<Vec<String>>,
displayed_fields: Setting<Vec<String>>, displayed_fields: Setting<Vec<String>>,
faceted_fields: Setting<HashMap<String, String>>, faceted_fields: Setting<HashSet<String>>,
criteria: Setting<Vec<String>>, criteria: Setting<Vec<String>>,
stop_words: Setting<BTreeSet<String>>, stop_words: Setting<BTreeSet<String>>,
distinct_attribute: Setting<String>, distinct_attribute: Setting<String>,
@ -123,7 +121,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.faceted_fields = Setting::Reset; self.faceted_fields = Setting::Reset;
} }
pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) { pub fn set_faceted_fields(&mut self, names_facet_types: HashSet<String>) {
self.faceted_fields = Setting::Set(names_facet_types); self.faceted_fields = Setting::Set(names_facet_types);
} }
@ -387,11 +385,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
match self.faceted_fields { match self.faceted_fields {
Setting::Set(ref fields) => { Setting::Set(ref fields) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut new_facets = HashMap::new(); let mut new_facets = HashSet::new();
for (name, ty) in fields { for name in fields {
fields_ids_map.insert(name).context("field id limit exceeded")?; fields_ids_map.insert(name).context("field id limit exceeded")?;
let ty = FacetType::from_str(&ty)?; new_facets.insert(name.clone());
new_facets.insert(name.clone(), ty);
} }
self.index.put_faceted_fields(self.wtxn, &new_facets)?; self.index.put_faceted_fields(self.wtxn, &new_facets)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
@ -445,9 +442,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::{btreeset, hashmap}; use heed::types::ByteSlice;
use maplit::{btreeset, hashmap, hashset};
use big_s::S;
use crate::facet::FacetType;
use crate::update::{IndexDocuments, UpdateFormat}; use crate::update::{IndexDocuments, UpdateFormat};
use super::*; use super::*;
@ -622,37 +620,53 @@ mod tests {
// Set the faceted fields to be the age. // Set the faceted fields to be the age.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "age".into() => "number".into() }); builder.set_faceted_fields(hashset!{ S("age") });
builder.execute(|_, _| ()).unwrap(); builder.execute(|_, _| ()).unwrap();
// Then index some documents. // Then index some documents.
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let content = &br#"[
{ "name": "kevin", "age": 23 },
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids(); builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the faceted fields are correctly set. // Check that the faceted fields are correctly set.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let fields_ids = index.faceted_fields(&rtxn).unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Number }); assert_eq!(fields_ids, hashset!{ S("age") });
// Only count the field_id 0 and level 0 facet values. // Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); // TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
assert_eq!(count, 3); assert_eq!(count, 3);
drop(rtxn); drop(rtxn);
// Index a few more documents with new and existing facet values. // Index a few more documents with new and existing facet values.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; let content = &br#"[
{ "name": "kevin2", "age": 23 },
{ "name": "kevina2", "age": 21 },
{ "name": "benoit", "age": 35 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.update_format(UpdateFormat::Csv); builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values. // Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); // TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
assert_eq!(count, 4); assert_eq!(count, 4);
} }
@ -819,10 +833,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_displayed_fields(vec!["hello".to_string()]);
builder.set_faceted_fields(hashmap!{ builder.set_faceted_fields(hashset!{ S("age"), S("toto") });
"age".into() => "number".into(),
"toto".into() => "number".into(),
});
builder.set_criteria(vec!["asc(toto)".to_string()]); builder.set_criteria(vec!["asc(toto)".to_string()]);
builder.execute(|_, _| ()).unwrap(); builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();