184: Transfer numbers and strings facets into the appropriate facet databases r=Kerollmops a=Kerollmops

This pull request is related to https://github.com/meilisearch/milli/issues/152 and changes the layout of the facets values, numbers and strings are now in dedicated databases and the user no more needs to define the type of the fields. No more conversion between the two types is done, numbers (floats and integers converted to f64) go to the facet float database and strings go to the strings facet database.

There is one related issue that I found regarding CSVs, the values in a CSV are always considered to be strings, [meilisearch/specifications#28](d916b57d74/text/0028-indexing-csv.md) fixes this issue by allowing the user to define the fields types using `:` in the "CSV Formatting Rules" section.

All previous tests on facets have been modified to pass again and I have also done hand-driven tests with the 115m songs dataset. Everything seems to be good!

Fixes #192.

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2021-05-31 13:32:58 +00:00 committed by GitHub
commit 2f5e61bacb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 1046 additions and 963 deletions

View File

@ -30,7 +30,6 @@ use warp::{Filter, http::Response};
use warp::filters::ws::Message;
use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore};
use milli::facet::FacetValue;
use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
use milli::update::UpdateIndexingStep::*;
@ -252,7 +251,7 @@ struct Settings {
searchable_attributes: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
faceted_attributes: Setting<HashMap<String, String>>,
faceted_attributes: Setting<HashSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
criteria: Setting<Vec<String>>,
@ -671,7 +670,7 @@ async fn main() -> anyhow::Result<()> {
struct Answer {
documents: Vec<Map<String, Value>>,
number_of_candidates: u64,
facets: BTreeMap<String, BTreeMap<FacetValue, u64>>,
facets: BTreeMap<String, BTreeMap<String, u64>>,
}
let disable_highlighting = opt.disable_highlighting;
@ -985,7 +984,7 @@ async fn main() -> anyhow::Result<()> {
#[cfg(test)]
mod tests {
use maplit::{btreeset,hashmap};
use maplit::{btreeset,hashmap, hashset};
use serde_test::{assert_tokens, Token};
use milli::update::Setting;
@ -997,7 +996,7 @@ mod tests {
let settings = Settings {
displayed_attributes: Setting::Set(vec!["name".to_string()]),
searchable_attributes: Setting::Set(vec!["age".to_string()]),
faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }),
faceted_attributes: Setting::Set(hashset!{ "age".to_string() }),
criteria: Setting::Set(vec!["asc(age)".to_string()]),
stop_words: Setting::Set(btreeset! { "and".to_string() }),
synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] })

View File

@ -5,6 +5,7 @@ use std::{str, io, fmt};
use anyhow::Context;
use byte_unit::Byte;
use heed::EnvOpenOptions;
use milli::facet::FacetType;
use milli::{Index, TreeLevel};
use structopt::StructOpt;
@ -22,8 +23,11 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids";
const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values";
const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids";
const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids";
const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s";
const FIELD_ID_DOCID_FACET_STRINGS_DB_NAME: &str = "field-id-docid-facet-strings";
const DOCUMENTS_DB_NAME: &str = "documents";
const ALL_DATABASE_NAMES: &[&str] = &[
@ -35,8 +39,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_VALUES_DB_NAME,
FACET_ID_F64_DOCIDS_DB_NAME,
FACET_ID_STRING_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_F64S_DB_NAME,
FIELD_ID_DOCID_FACET_STRINGS_DB_NAME,
DOCUMENTS_DB_NAME,
];
@ -108,8 +114,18 @@ enum Command {
prefixes: Vec<String>,
},
/// Outputs a CSV with the documents ids along with the facet values where it appears.
FacetValuesDocids {
/// Outputs a CSV with the documents ids along with the facet numbers where it appears.
FacetNumbersDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
/// The field name in the document.
field_name: String,
},
/// Outputs a CSV with the documents ids along with the facet strings where it appears.
FacetStringsDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
@ -149,8 +165,8 @@ enum Command {
internal_documents_ids: Vec<u32>,
},
/// Outputs some facets statistics for the given facet name.
FacetStats {
/// Outputs some facets numbers statistics for the given facet name.
FacetNumberStats {
/// The field name in the document.
field_name: String,
},
@ -243,8 +259,11 @@ fn main() -> anyhow::Result<()> {
WordsPrefixesDocids { full_display, prefixes } => {
words_prefixes_docids(&index, &rtxn, !full_display, prefixes)
},
FacetValuesDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, field_name)
FacetNumbersDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name)
},
FacetStringsDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name)
},
WordsLevelPositionsDocids { full_display, words } => {
words_level_positions_docids(&index, &rtxn, !full_display, words)
@ -255,7 +274,7 @@ fn main() -> anyhow::Result<()> {
DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
},
FacetStats { field_name } => facet_stats(&index, &rtxn, field_name),
FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositionsByWord => {
average_number_of_positions_by_word(&index, &rtxn)
@ -297,36 +316,22 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow:
}
/// Helper function that converts the facet value key to a unique type
/// that can be used to log or display purposes.
fn facet_values_iter<'txn, DC: 'txn, T>(
/// that can be used for log or display purposes.
fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>(
rtxn: &'txn heed::RoTxn,
db: heed::Database<heed::types::ByteSlice, DC>,
db: heed::Database<KC, DC>,
field_id: u8,
facet_type: milli::facet::FacetType,
string_fn: impl Fn(&str) -> T + 'txn,
float_fn: impl Fn(u8, f64, f64) -> T + 'txn,
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn>>
) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(KC::DItem, DC::DItem)>> + 'txn>>
where
KC: heed::BytesDecode<'txn>,
DC: heed::BytesDecode<'txn>,
{
use milli::facet::FacetType;
use milli::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
let iter = db
.remap_key_type::<heed::types::ByteSlice>()
.prefix_iter(&rtxn, &[field_id])?
.remap_key_type::<KC>();
let iter = db.prefix_iter(&rtxn, &[field_id])?;
match facet_type {
FacetType::String => {
let iter = iter.remap_key_type::<FacetValueStringCodec>()
.map(move |r| r.map(|((_, key), value)| (string_fn(key), value)));
Ok(Box::new(iter) as Box<dyn Iterator<Item=_>>)
},
FacetType::Number => {
let iter = iter.remap_key_type::<FacetLevelValueF64Codec>()
.map(move |r| r.map(|((_, level, left, right), value)| {
(float_fn(level, left, right), value)
}));
Ok(Box::new(iter))
},
}
}
fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> (u8, String) {
@ -352,9 +357,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values: _,
documents
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
documents,
} = index;
let main_name = "main";
@ -365,7 +372,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let word_level_position_docids_name = "word_level_position_docids";
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
let facet_id_f64_docids_name = "facet_id_f64_docids";
let facet_id_string_docids_name = "facet_id_string_docids";
let documents_name = "documents";
let mut heap = BinaryHeap::with_capacity(limit + 1);
@ -437,27 +445,27 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
for (field_id, field_type) in faceted_fields {
let facet_name = fields_ids_map.name(field_id).unwrap();
let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>();
let iter = facet_values_iter(
rtxn,
db,
field_id,
field_type,
|key| key.to_owned(),
|level, left, right| {
for facet_id in faceted_fields {
let facet_name = fields_ids_map.name(facet_id).unwrap();
// List the facet numbers of this facet id.
let db = facet_id_f64_docids.remap_data_type::<ByteSlice>();
for result in facet_values_iter(rtxn, db, facet_id)? {
let ((_fid, level, left, right), value) = result?;
let mut output = facet_number_value_to_string(level, left, right).1;
let _ = write!(&mut output, " (level {})", level);
output
},
)?;
write!(&mut output, " (level {})", level)?;
let key = format!("{} {}", facet_name, output);
heap.push(Reverse((value.len(), key, facet_id_f64_docids_name)));
if heap.len() > limit { heap.pop(); }
}
for result in iter {
let (fvalue, value) = result?;
// List the facet strings of this facet id.
let db = facet_id_string_docids.remap_data_type::<ByteSlice>();
for result in facet_values_iter(rtxn, db, facet_id)? {
let ((_fid, fvalue), value) = result?;
let key = format!("{} {}", facet_name, fvalue);
heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name)));
heap.push(Reverse((value.len(), key, facet_id_string_docids_name)));
if heap.len() > limit { heap.pop(); }
}
}
@ -536,31 +544,33 @@ fn words_prefixes_docids(
Ok(wtr.flush()?)
}
fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
fn facet_values_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
facet_type: FacetType,
field_name: String,
) -> anyhow::Result<()>
{
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?;
let field_type = faceted_fields.get(&field_id)
.with_context(|| format!("field {} is not faceted", field_name))?;
if !faceted_fields.contains(&field_id) {
anyhow::bail!("field {} is not faceted", field_name);
}
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["facet_value", "facet_level", "documents_count", "documents_ids"])?;
let db = index.facet_field_id_value_docids;
let iter = facet_values_iter(
rtxn,
db,
field_id,
*field_type,
|key| (0, key.to_owned()),
facet_number_value_to_string,
)?;
for result in iter {
let ((level, value), docids) = result?;
match facet_type {
FacetType::Number => {
wtr.write_record(&["facet_number", "facet_level", "documents_count", "documents_ids"])?;
for result in facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)? {
let ((_fid, level, left, right), docids) = result?;
let value = facet_number_value_to_string(level, left, right).1;
let count = docids.len();
let docids = if debug {
format!("{:?}", docids)
@ -569,6 +579,21 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
};
wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?;
}
},
FacetType::String => {
wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
let ((_fid, value), docids) = result?;
let count = docids.len();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[value.to_string(), count.to_string(), docids])?;
}
}
}
Ok(wtr.flush()?)
}
@ -684,31 +709,24 @@ fn docids_words_positions(
Ok(wtr.flush()?)
}
fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?;
let field_type = faceted_fields.get(&field_id)
.with_context(|| format!("field {} is not faceted", field_name))?;
let db = index.facet_field_id_value_docids;
let iter = facet_values_iter(
rtxn,
db,
field_id,
*field_type,
|_key| 0u8,
|level, _left, _right| level,
)?;
if !faceted_fields.contains(&field_id) {
anyhow::bail!("field {} is not faceted", field_name);
}
let iter = facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)?;
println!("The database {:?} facet stats", field_name);
let mut level_size = 0;
let mut current_level = None;
for result in iter {
let (level, _) = result?;
let ((_fid, level, _left, _right), _) = result?;
if let Some(current) = current_level {
if current != level {
println!("\tnumber of groups at level {}: {}", current, level_size);
@ -843,7 +861,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
use heed::types::ByteSlice;
let Index {
env: _,
env: _env,
main,
word_docids,
word_prefix_docids,
@ -852,8 +870,10 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents,
} = index;
@ -873,8 +893,11 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(),
FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(),
FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(),
FIELD_ID_DOCID_FACET_STRINGS_DB_NAME => field_id_docid_facet_strings.as_polymorph(),
DOCUMENTS_DB_NAME => documents.as_polymorph(),
unknown => anyhow::bail!("unknown database {:?}", unknown),
};

View File

@ -1,4 +1,4 @@
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
use anyhow::{Context, bail};
@ -6,8 +6,6 @@ use regex::Regex;
use serde::{Serialize, Deserialize};
use once_cell::sync::Lazy;
use crate::facet::FacetType;
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()
});
@ -33,7 +31,7 @@ pub enum Criterion {
}
impl Criterion {
pub fn from_str(faceted_attributes: &HashMap<String, FacetType>, txt: &str) -> anyhow::Result<Criterion> {
pub fn from_str(faceted_attributes: &HashSet<String>, txt: &str) -> anyhow::Result<Criterion> {
match txt {
"words" => Ok(Criterion::Words),
"typo" => Ok(Criterion::Typo),
@ -44,7 +42,9 @@ impl Criterion {
let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?;
let order = caps.get(1).unwrap().as_str();
let field_name = caps.get(2).unwrap().as_str();
faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?;
faceted_attributes.get(field_name).with_context(|| {
format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name)
})?;
match order {
"asc" => Ok(Criterion::Asc(field_name.to_string())),
"desc" => Ok(Criterion::Desc(field_name.to_string())),

View File

@ -1,5 +1,5 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use anyhow::Context;
@ -14,24 +14,28 @@ use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
};
use crate::facet::FacetType;
use crate::heed_codec::facet::{
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FacetValueStringCodec, FacetLevelValueF64Codec,
};
use crate::fields_ids_map::FieldsIdsMap;
pub const CRITERIA_KEY: &str = "criteria";
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
pub const WORDS_FST_KEY: &str = "words-fst";
pub const STOP_WORDS_KEY: &str = "stop-words";
pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
pub const SYNONYMS_KEY: &str = "synonyms";
pub const WORDS_FST_KEY: &str = "words-fst";
pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
const CREATED_AT_KEY: &str = "created-at";
const UPDATED_AT_KEY: &str = "updated-at";
@ -40,33 +44,45 @@ const UPDATED_AT_KEY: &str = "updated-at";
pub struct Index {
/// The LMDB environment which this index is associated with.
pub env: heed::Env,
/// Contains many different types (e.g. the fields ids map).
pub main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
/// Maps a word and a document id (u32) to all the positions where the given word appears.
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the word, level and position range with the docids that corresponds to it.
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
/// Maps the document id, the facet field id and the globally ordered value.
pub field_id_docid_facet_values: Database<ByteSlice, Unit>,
/// Maps the facet field id, level and the number with the docids that corresponds to it.
pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the string with the docids that corresponds to it.
pub facet_id_string_docids: Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
/// Maps the document id, the facet field id and the numbers.
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Unit>,
/// Maps the document id to the document as an obkv store.
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
impl Index {
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
options.max_dbs(11);
options.max_dbs(13);
let env = options.open(path)?;
let main = env.create_poly_database(Some("main"))?;
@ -77,20 +93,13 @@ impl Index {
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?;
let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?;
let field_id_docid_facet_f64s = env.create_database(Some("field-id-docid-facet-f64s"))?;
let field_id_docid_facet_strings = env.create_database(Some("field-id-docid-facet-strings"))?;
let documents = env.create_database(Some("documents"))?;
{
let mut txn = env.write_txn()?;
// The db was just created, we update its metadata with the relevant information.
if main.get::<_, Str, SerdeJson<DateTime<Utc>>>(&txn, CREATED_AT_KEY)?.is_none() {
let now = Utc::now();
main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut txn, UPDATED_AT_KEY, &now)?;
main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut txn, CREATED_AT_KEY, &now)?;
txn.commit()?;
}
}
Index::initialize_creation_dates(&env, main)?;
Ok(Index {
env,
@ -102,12 +111,26 @@ impl Index {
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents,
})
}
fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> {
let mut txn = env.write_txn()?;
// The db was just created, we update its metadata with the relevant information.
if main.get::<_, Str, SerdeJson<DateTime<Utc>>>(&txn, CREATED_AT_KEY)?.is_none() {
let now = Utc::now();
main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut txn, UPDATED_AT_KEY, &now)?;
main.put::<_, Str, SerdeJson<DateTime<Utc>>>(&mut txn, CREATED_AT_KEY, &now)?;
txn.commit()?;
}
Ok(())
}
/// Create a write transaction to be able to write into the index.
pub fn write_txn(&self) -> heed::Result<RwTxn> {
self.env.write_txn()
@ -298,53 +321,97 @@ impl Index {
/* faceted fields */
/// Writes the facet fields associated with their facet type or `None` if
/// the facet type is currently unknown.
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<String, FacetType>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types)
/// Writes the facet fields names in the database.
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields)
}
/// Deletes the facet fields ids associated with their facet type.
/// Deletes the facet fields ids in the database.
pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY)
}
/// Returns the facet fields names associated with their facet type.
pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, FacetType>> {
/// Returns the facet fields names.
pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
}
/// Same as `faceted_fields`, but returns ids instead.
pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashMap<FieldId, FacetType>> {
pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashSet<FieldId>> {
let faceted_fields = self.faceted_fields(rtxn)?;
let fields_ids_map = self.fields_ids_map(rtxn)?;
let faceted_fields = faceted_fields
.iter()
.map(|(k, v)| {
let kid = fields_ids_map
.map(|k| {
fields_ids_map
.id(k)
.ok_or_else(|| format!("{:?} should be present in the field id map", k))
.expect("corrupted data: ");
(kid, *v)
.expect("corrupted data: ")
})
.collect();
Ok(faceted_fields)
}
/* faceted documents ids */
/// Writes the documents ids that are faceted under this field id.
pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap) -> heed::Result<()> {
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
/// Writes the documents ids that are faceted with numbers under this field id.
pub fn put_number_faceted_documents_ids(
&self,
wtxn: &mut RwTxn,
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()>
{
let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
}
/// Retrieve all the documents ids that faceted under this field id.
pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: FieldId) -> heed::Result<RoaringBitmap> {
let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
/// Retrieve all the documents ids that faceted with numbers under this field id.
pub fn number_faceted_documents_ids(
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap>
{
let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
}
}
/// Writes the documents ids that are faceted with strings under this field id.
pub fn put_string_faceted_documents_ids(
&self,
wtxn: &mut RwTxn,
field_id: FieldId,
docids: &RoaringBitmap,
) -> heed::Result<()>
{
let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
}
/// Retrieve all the documents ids that faceted with strings under this field id.
pub fn string_faceted_documents_ids(
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap>
{
let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
.copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
*buffer.last_mut().unwrap() = field_id;
match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
Some(docids) => Ok(docids),

View File

@ -1,18 +1,15 @@
use std::collections::HashMap;
use std::mem::take;
use anyhow::{bail, Context as _};
use anyhow::Context;
use itertools::Itertools;
use log::debug;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetF64Codec;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation;
use crate::{FieldsIdsMap, FieldId, Index};
use crate::{FieldId, Index};
use super::{Criterion, CriterionParameters, CriterionResult};
/// Threshold on the number of candidates that will make
@ -24,7 +21,6 @@ pub struct AscDesc<'t> {
rtxn: &'t heed::RoTxn<'t>,
field_name: String,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
query_tree: Option<Operation>,
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
@ -39,8 +35,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
) -> anyhow::Result<Self>
{
) -> anyhow::Result<Self> {
Self::new(index, rtxn, parent, field_name, true)
}
@ -49,8 +44,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
) -> anyhow::Result<Self>
{
) -> anyhow::Result<Self> {
Self::new(index, rtxn, parent, field_name, false)
}
@ -60,22 +54,21 @@ impl<'t> AscDesc<'t> {
parent: Box<dyn Criterion + 't>,
field_name: String,
ascending: bool,
) -> anyhow::Result<Self>
{
) -> anyhow::Result<Self> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?;
let field_id = fields_ids_map
.id(&field_name)
.with_context(|| format!("field {:?} isn't registered", field_name))?;
Ok(AscDesc {
index,
rtxn,
field_name,
field_id,
facet_type,
ascending,
query_tree: None,
candidates: Box::new(std::iter::empty()),
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?,
bucket_candidates: RoaringBitmap::new(),
parent,
})
@ -86,8 +79,10 @@ impl<'t> Criterion for AscDesc<'t> {
#[logging_timer::time("AscDesc::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
loop {
debug!("Facet {}({}) iteration",
if self.ascending { "Asc" } else { "Desc" }, self.field_name
debug!(
"Facet {}({}) iteration",
if self.ascending { "Asc" } else { "Desc" },
self.field_name
);
match self.candidates.next().transpose()? {
@ -122,7 +117,6 @@ impl<'t> Criterion for AscDesc<'t> {
self.index,
self.rtxn,
self.field_id,
self.facet_type,
self.ascending,
candidates,
)?;
@ -138,25 +132,10 @@ impl<'t> Criterion for AscDesc<'t> {
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
}
}
}
fn field_id_facet_type(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>,
field: &str,
) -> anyhow::Result<(FieldId, FacetType)>
{
let id = fields_ids_map.id(field).with_context(|| {
format!("field {:?} isn't registered", field)
})?;
let facet_type = faceted_fields.get(field).with_context(|| {
format!("field {:?} isn't faceted", field)
})?;
Ok((id, *facet_type))
}
/// Returns an iterator over groups of the given candidates in ascending or descending order.
@ -167,17 +146,11 @@ fn facet_ordered<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
candidates: RoaringBitmap,
) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>>
{
match facet_type {
FacetType::Number => {
) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
if candidates.len() <= CANDIDATES_THRESHOLD {
let iter = iterative_facet_ordered_iter(
index, rtxn, field_id, ascending, candidates,
)?;
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else {
let facet_fn = if ascending {
@ -188,9 +161,6 @@ fn facet_ordered<'t>(
let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
}
},
FacetType::String => bail!("criteria facet type must be a number"),
}
}
/// Fetch the whole list of candidates facet values one by one and order them by it.
@ -202,14 +172,14 @@ fn iterative_facet_ordered_iter<'t>(
field_id: FieldId,
ascending: bool,
candidates: RoaringBitmap,
) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't>
{
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't> {
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = db.range(rtxn, &(left..=right))?;
let mut iter = index
.field_id_docid_facet_f64s
.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));
@ -226,7 +196,8 @@ fn iterative_facet_ordered_iter<'t>(
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter.group_by(|(_, v)| *v)
let vec: Vec<_> = iter
.group_by(|(_, v)| v.clone())
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();

View File

@ -1,10 +1,14 @@
use std::mem::size_of;
use heed::types::ByteSlice;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::*;
use crate::{facet::FacetType, DocumentId, FieldId, Index};
use super::{Distinct, DocIter};
use crate::heed_codec::facet::*;
use crate::{DocumentId, FieldId, Index};
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
/// A distinct implementer that is backed by facets.
///
@ -18,21 +22,14 @@ pub struct FacetDistinct<'a> {
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
}
impl<'a> FacetDistinct<'a> {
pub fn new(
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
) -> Self {
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
Self {
distinct,
index,
txn,
facet_type,
}
}
}
@ -41,38 +38,33 @@ pub struct FacetDistinctIter<'a> {
candidates: RoaringBitmap,
distinct: FieldId,
excluded: RoaringBitmap,
facet_type: FacetType,
index: &'a Index,
iter_offset: usize,
txn: &'a heed::RoTxn<'a>,
}
impl<'a> FacetDistinctIter<'a> {
fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap>
where
KC: heed::BytesEncode<'c>,
{
let facet_docids = self
.index
.facet_field_id_value_docids
.remap_key_type::<KC>()
.get(self.txn, key)?
.expect("Corrupted data: Facet values must exist");
Ok(facet_docids)
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index
.facet_id_string_docids
.get(self.txn, &(self.distinct, key))
}
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
// get facet docids on level 0
self.index
.facet_id_f64_docids
.get(self.txn, &(self.distinct, 0, key, key))
}
fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetStringCodec>(
id,
self.distinct,
self.index,
self.txn,
)?;
let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
for item in iter {
let ((_, _, value), _) = item?;
let key = (self.distinct, value);
let facet_docids = self.get_facet_docids::<FacetValueStringCodec>(&key)?;
let facet_docids = self
.facet_string_docids(value)?
.expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids);
}
@ -82,17 +74,13 @@ impl<'a> FacetDistinctIter<'a> {
}
fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetF64Codec>(id,
self.distinct,
self.index,
self.txn,
)?;
let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
for item in iter {
let ((_, _, value), _) = item?;
// get facet docids on level 0
let key = (self.distinct, 0, value, value);
let facet_docids = self.get_facet_docids::<FacetLevelValueF64Codec>(&key)?;
let facet_docids = self
.facet_number_docids(value)?
.expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids);
}
@ -111,16 +99,16 @@ impl<'a> FacetDistinctIter<'a> {
let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
match candidates_iter.next() {
Some(id) => {
match self.facet_type {
FacetType::String => self.distinct_string(id)?,
FacetType::Number => self.distinct_number(id)?,
};
// We distinct the document id on its facet strings and facet numbers.
self.distinct_string(id)?;
self.distinct_number(id)?;
// The first document of each iteration is kept, since the next call to
// `difference_with` will filter out all the documents for that facet value. By
// increasing the offset we make sure to get the first valid value for the next
// distinct document to keep.
self.iter_offset += 1;
Ok(Some(id))
}
// no more candidate at this offset, return.
@ -129,26 +117,44 @@ impl<'a> FacetDistinctIter<'a> {
}
}
fn get_facet_values<'a, KC>(
fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
key
}
fn facet_number_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>>
where
KC: heed::BytesDecode<'a>,
{
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_values
.field_id_docid_facet_f64s
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type::<KC>();
.remap_key_type::<FieldDocIdFacetF64Codec>();
Ok(iter)
}
fn facet_string_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_strings
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type::<FieldDocIdFacetStringCodec>();
Ok(iter)
}
@ -174,7 +180,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
candidates,
distinct: self.distinct,
excluded,
facet_type: self.facet_type,
index: self.index,
iter_offset: 0,
txn: self.txn,
@ -184,22 +189,21 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
#[cfg(test)]
mod test {
use std::collections::HashMap;
use std::collections::HashSet;
use super::*;
use super::super::test::{generate_index, validate_distinct_candidates};
use crate::facet::FacetType;
use super::*;
macro_rules! test_facet_distinct {
($name:ident, $distinct:literal, $facet_type:expr) => {
($name:ident, $distinct:literal) => {
#[test]
fn $name() {
use std::iter::FromIterator;
let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string())));
let facets = HashSet::from_iter(Some(($distinct.to_string())));
let (index, fid, candidates) = generate_index($distinct, facets);
let txn = index.read_txn().unwrap();
let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type);
let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new();
let mut iter = map_distinct.distinct(candidates.clone(), excluded);
let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
@ -209,7 +213,7 @@ mod test {
};
}
test_facet_distinct!(test_string, "txt", FacetType::String);
test_facet_distinct!(test_strings, "txts", FacetType::String);
test_facet_distinct!(test_number, "cat-int", FacetType::Number);
test_facet_distinct!(test_string, "txt");
test_facet_distinct!(test_strings, "txts");
test_facet_distinct!(test_number, "cat-int");
}

View File

@ -110,7 +110,7 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> {
#[cfg(test)]
mod test {
use std::collections::HashMap;
use std::collections::HashSet;
use super::*;
use super::super::test::{generate_index, validate_distinct_candidates};
@ -119,7 +119,7 @@ mod test {
($name:ident, $distinct:literal) => {
#[test]
fn $name() {
let (index, fid, candidates) = generate_index($distinct, HashMap::new());
let (index, fid, candidates) = generate_index($distinct, HashSet::new());
let txn = index.read_txn().unwrap();
let mut map_distinct = MapDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new();

View File

@ -28,7 +28,7 @@ pub trait Distinct<'a> {
#[cfg(test)]
mod test {
use std::collections::{HashMap, HashSet};
use std::collections::HashSet;
use once_cell::sync::Lazy;
use rand::{seq::SliceRandom, Rng};
@ -74,7 +74,7 @@ mod test {
/// Returns a temporary index populated with random test documents, the FieldId for the
/// distinct attribute, and the RoaringBitmap with the document ids.
pub(crate) fn generate_index(distinct: &str, facets: HashMap<String, String>) -> (TempIndex, FieldId, RoaringBitmap) {
pub(crate) fn generate_index(distinct: &str, facets: HashSet<String>) -> (TempIndex, FieldId, RoaringBitmap) {
let index = TempIndex::new();
let mut txn = index.write_txn().unwrap();

View File

@ -1,9 +1,8 @@
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Debug;
use std::ops::Bound::{self, Included, Excluded};
use std::str::FromStr;
use anyhow::Context;
use either::Either;
use heed::types::DecodeIgnore;
use log::debug;
@ -12,7 +11,6 @@ use pest::iterators::{Pair, Pairs};
use pest::Parser;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec};
@ -21,122 +19,96 @@ use super::parser::Rule;
use super::parser::{PREC_CLIMBER, FilterParser};
use self::FacetCondition::*;
use self::FacetNumberOperator::*;
use self::Operator::*;
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum FacetNumberOperator {
#[derive(Debug, Clone, PartialEq)]
pub enum Operator {
GreaterThan(f64),
GreaterThanOrEqual(f64),
Equal(f64),
NotEqual(f64),
Equal(Option<f64>, String),
NotEqual(Option<f64>, String),
LowerThan(f64),
LowerThanOrEqual(f64),
Between(f64, f64),
}
impl FacetNumberOperator {
impl Operator {
/// This method can return two operations in case it must express
/// an OR operation for the between case (i.e. `TO`).
fn negate(self) -> (Self, Option<Self>) {
match self {
GreaterThan(x) => (LowerThanOrEqual(x), None),
GreaterThanOrEqual(x) => (LowerThan(x), None),
Equal(x) => (NotEqual(x), None),
NotEqual(x) => (Equal(x), None),
LowerThan(x) => (GreaterThanOrEqual(x), None),
LowerThanOrEqual(x) => (GreaterThan(x), None),
Between(x, y) => (LowerThan(x), Some(GreaterThan(y))),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetStringOperator {
Equal(String),
NotEqual(String),
}
impl FacetStringOperator {
fn equal(s: &str) -> Self {
FacetStringOperator::Equal(s.to_lowercase())
}
#[allow(dead_code)]
fn not_equal(s: &str) -> Self {
FacetStringOperator::equal(s).negate()
}
fn negate(self) -> Self {
match self {
FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x),
FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x),
GreaterThan(n) => (LowerThanOrEqual(n), None),
GreaterThanOrEqual(n) => (LowerThan(n), None),
Equal(n, s) => (NotEqual(n, s), None),
NotEqual(n, s) => (Equal(n, s), None),
LowerThan(n) => (GreaterThanOrEqual(n), None),
LowerThanOrEqual(n) => (GreaterThan(n), None),
Between(n, m) => (LowerThan(n), Some(GreaterThan(m))),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum FacetCondition {
OperatorString(FieldId, FacetStringOperator),
OperatorNumber(FieldId, FacetNumberOperator),
Operator(FieldId, Operator),
Or(Box<Self>, Box<Self>),
And(Box<Self>, Box<Self>),
}
fn get_field_id_facet_type<'a>(
fn field_id(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
items: &mut Pairs<'a, Rule>,
) -> Result<(FieldId, FacetType), PestError<Rule>>
faceted_fields: &HashSet<FieldId>,
items: &mut Pairs<Rule>,
) -> Result<FieldId, PestError<Rule>>
{
// lexing ensures that we at least have a key
let key = items.next().unwrap();
let field_id = fields_ids_map
.id(key.as_str())
.ok_or_else(|| {
PestError::new_from_span(
let field_id = match fields_ids_map.id(key.as_str()) {
Some(field_id) => field_id,
None => return Err(PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` not found, available attributes are: {}",
key.as_str(),
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ")
fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "),
),
},
key.as_span(),
)
})?;
)),
};
let facet_type = faceted_fields
.get(&field_id)
.copied()
.ok_or_else(|| {
PestError::new_from_span(
if !faceted_fields.contains(&field_id) {
return Err(PestError::new_from_span(
ErrorVariant::CustomError {
message: format!(
"attribute `{}` is not faceted, available faceted attributes are: {}",
key.as_str(),
faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ")
faceted_fields.iter().flat_map(|id| {
fields_ids_map.name(*id)
}).collect::<Vec<_>>().join(", "),
),
},
key.as_span(),
)
})?;
Ok((field_id, facet_type))
));
}
fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>>
Ok(field_id)
}
fn pest_parse<T>(pair: Pair<Rule>) -> (Result<T, pest::error::Error<Rule>>, String)
where T: FromStr,
T::Err: ToString,
{
match pair.as_str().parse() {
let result = match pair.as_str().parse::<T>() {
Ok(value) => Ok(value),
Err(e) => {
Err(PestError::<Rule>::new_from_span(
Err(e) => Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError { message: e.to_string() },
pair.as_span(),
))
}
}
)),
};
(result, pair.as_str().to_string())
}
impl FacetCondition {
@ -150,34 +122,6 @@ impl FacetCondition {
A: AsRef<str>,
B: AsRef<str>,
{
fn facet_condition(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>,
key: &str,
value: &str,
) -> anyhow::Result<FacetCondition>
{
let fid = fields_ids_map.id(key).with_context(|| {
format!("{:?} isn't present in the fields ids map", key)
})?;
let ftype = faceted_fields.get(key).copied().with_context(|| {
format!("{:?} isn't a faceted field", key)
})?;
let (neg, value) = match value.trim().strip_prefix('-') {
Some(value) => (true, value.trim()),
None => (false, value.trim()),
};
let operator = match ftype {
FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)),
FacetType::Number => OperatorNumber(fid, FacetNumberOperator::Equal(value.parse()?)),
};
if neg { Ok(operator.negate()) } else { Ok(operator) }
}
let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?;
let mut ands = None;
for either in array {
@ -185,10 +129,7 @@ impl FacetCondition {
Either::Left(array) => {
let mut ors = None;
for rule in array {
let mut iter = rule.as_ref().splitn(2, ':');
let key = iter.next().context("missing facet condition key")?;
let value = iter.next().context("missing facet condition value")?;
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?;
ors = match ors.take() {
Some(ors) => Some(Or(Box::new(ors), Box::new(condition))),
None => Some(condition),
@ -203,10 +144,7 @@ impl FacetCondition {
}
},
Either::Right(rule) => {
let mut iter = rule.as_ref().splitn(2, ':');
let key = iter.next().context("missing facet condition key")?;
let value = iter.next().context("missing facet condition value")?;
let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?;
ands = match ands.take() {
Some(ands) => Some(And(Box::new(ands), Box::new(condition))),
None => Some(condition),
@ -232,7 +170,7 @@ impl FacetCondition {
fn from_pairs(
fim: &FieldsIdsMap,
ff: &HashMap<FieldId, FacetType>,
ff: &HashSet<FieldId>,
expression: Pairs<Rule>,
) -> anyhow::Result<Self>
{
@ -263,10 +201,9 @@ impl FacetCondition {
fn negate(self) -> FacetCondition {
match self {
OperatorString(fid, op) => OperatorString(fid, op.negate()),
OperatorNumber(fid, op) => match op.negate() {
(op, None) => OperatorNumber(fid, op),
(a, Some(b)) => Or(Box::new(OperatorNumber(fid, a)), Box::new(OperatorNumber(fid, b))),
Operator(fid, op) => match op.negate() {
(op, None) => Operator(fid, op),
(a, Some(b)) => Or(Box::new(Operator(fid, a)), Box::new(Operator(fid, b))),
},
Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())),
And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())),
@ -275,137 +212,96 @@ impl FacetCondition {
fn between(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let lvalue = items.next().unwrap();
let rvalue = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => {
let lvalue = pest_parse(lvalue)?;
let rvalue = pest_parse(rvalue)?;
Ok(OperatorNumber(fid, Between(lvalue, rvalue)))
},
}
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let (lresult, _) = pest_parse(items.next().unwrap());
let (rresult, _) = pest_parse(items.next().unwrap());
let lvalue = lresult?;
let rvalue = rresult?;
Ok(Operator(fid, Between(lvalue, rvalue)))
}
fn equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))),
FacetType::Number => Ok(OperatorNumber(fid, Equal(pest_parse(value)?))),
}
let (result, svalue) = pest_parse(value);
let svalue = svalue.to_lowercase();
Ok(Operator(fid, Equal(result.ok(), svalue)))
}
fn greater_than(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, GreaterThan(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, GreaterThan(result?)))
}
fn greater_than_or_equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, GreaterThanOrEqual(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, GreaterThanOrEqual(result?)))
}
fn lower_than(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, LowerThan(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, LowerThan(result?)))
}
fn lower_than_or_equal(
fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<FieldId, FacetType>,
faceted_fields: &HashSet<FieldId>,
item: Pair<Rule>,
) -> anyhow::Result<FacetCondition>
{
let item_span = item.as_span();
let mut items = item.into_inner();
let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?;
let fid = field_id(fields_ids_map, faceted_fields, &mut items)?;
let value = items.next().unwrap();
match ftype {
FacetType::String => {
Err(PestError::<Rule>::new_from_span(
ErrorVariant::CustomError {
message: "invalid operator on a faceted string".to_string(),
},
item_span,
).into())
},
FacetType::Number => Ok(OperatorNumber(fid, LowerThanOrEqual(pest_parse(value)?))),
}
let (result, _svalue) = pest_parse(value);
Ok(Operator(fid, LowerThanOrEqual(result?)))
}
}
@ -485,34 +381,53 @@ impl FacetCondition {
Ok(())
}
fn evaluate_number_operator<>(
fn evaluate_operator(
rtxn: &heed::RoTxn,
index: &Index,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId,
operator: FacetNumberOperator,
operator: &Operator,
) -> anyhow::Result<RoaringBitmap>
{
// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
let (left, right) = match operator {
GreaterThan(val) => (Excluded(val), Included(f64::MAX)),
GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)),
Equal(val) => (Included(val), Included(val)),
NotEqual(val) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
let docids = Self::evaluate_number_operator(rtxn, index, db, field_id, Equal(val))?;
return Ok(all_documents_ids - docids);
GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
Equal(number, string) => {
let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
let number_docids = match number {
Some(n) => {
let n = Included(*n);
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?;
output
},
LowerThan(val) => (Included(f64::MIN), Excluded(val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)),
Between(left, right) => (Included(left), Included(right)),
None => RoaringBitmap::new(),
};
return Ok(string_docids | number_docids);
},
NotEqual(number, string) => {
let all_numbers_ids = if number.is_some() {
index.number_faceted_documents_ids(rtxn, field_id)?
} else {
RoaringBitmap::new()
};
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Equal(*number, string.clone());
let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
},
LowerThan(val) => (Included(f64::MIN), Excluded(*val)),
LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
Between(left, right) => (Included(*left), Included(*right)),
};
// Ask for the biggest value that can exist for this specific field, if it exists
// that's fine if it don't, the value just before will be returned instead.
let biggest_level = db
let biggest_level = numbers_db
.remap_data_type::<DecodeIgnore>()
.get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))?
.and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None });
@ -520,52 +435,25 @@ impl FacetCondition {
match biggest_level {
Some(level) => {
let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, db, field_id, level, left, right, &mut output)?;
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?;
Ok(output)
},
None => Ok(RoaringBitmap::new()),
}
}
fn evaluate_string_operator(
rtxn: &heed::RoTxn,
index: &Index,
db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>,
field_id: FieldId,
operator: &FacetStringOperator,
) -> anyhow::Result<RoaringBitmap>
{
match operator {
FacetStringOperator::Equal(string) => {
match db.get(rtxn, &(field_id, string))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new())
}
},
FacetStringOperator::NotEqual(string) => {
let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?;
let op = FacetStringOperator::Equal(string.clone());
let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?;
Ok(all_documents_ids - docids)
},
}
}
pub fn evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
) -> anyhow::Result<RoaringBitmap>
{
let db = index.facet_field_id_value_docids;
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
match self {
OperatorString(fid, op) => {
let db = db.remap_key_type::<FacetValueStringCodec>();
Self::evaluate_string_operator(rtxn, index, db, *fid, op)
},
OperatorNumber(fid, op) => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
Self::evaluate_number_operator(rtxn, index, db, *fid, *op)
Operator(fid, op) => {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op)
},
Or(lhs, rhs) => {
let lhs = lhs.evaluate(rtxn, index)?;
@ -586,7 +474,8 @@ mod tests {
use super::*;
use crate::update::Settings;
use heed::EnvOpenOptions;
use maplit::hashmap;
use maplit::hashset;
use big_s::S;
#[test]
fn string() {
@ -598,22 +487,22 @@ mod tests {
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() });
builder.set_faceted_fields(hashset!{ S("channel") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::equal("Ponce"));
let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap();
let expected = Operator(0, Operator::Equal(None, S("ponce")));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::not_equal("ponce"));
let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
let expected = OperatorString(0, FacetStringOperator::not_equal("ponce"));
let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
assert_eq!(condition, expected);
}
@ -627,20 +516,20 @@ mod tests {
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() });
builder.set_faceted_fields(hashset!{ "timestamp".into() });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
// Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
let expected = OperatorNumber(0, Between(22.0, 44.0));
let expected = Operator(0, Between(22.0, 44.0));
assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or(
Box::new(OperatorNumber(0, LowerThan(22.0))),
Box::new(OperatorNumber(0, GreaterThan(44.0))),
Box::new(Operator(0, LowerThan(22.0))),
Box::new(Operator(0, GreaterThan(44.0))),
);
assert_eq!(condition, expected);
}
@ -655,11 +544,8 @@ mod tests {
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
builder.set_faceted_fields(hashmap!{
"channel".into() => "string".into(),
"timestamp".into() => "number".into(),
});
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -670,10 +556,10 @@ mod tests {
"channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))),
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(And(
Box::new(OperatorNumber(1, Between(22.0, 44.0))),
Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))),
Box::new(Operator(1, Between(22.0, 44.0))),
Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))),
))
);
assert_eq!(condition, expected);
@ -683,13 +569,13 @@ mod tests {
"channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)",
).unwrap();
let expected = Or(
Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))),
Box::new(Operator(0, Operator::Equal(None, S("gotaga")))),
Box::new(Or(
Box::new(Or(
Box::new(OperatorNumber(1, LowerThan(22.0))),
Box::new(OperatorNumber(1, GreaterThan(44.0))),
Box::new(Operator(1, LowerThan(22.0))),
Box::new(Operator(1, GreaterThan(44.0))),
)),
Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))),
Box::new(Operator(0, Operator::Equal(None, S("ponce")))),
)),
);
assert_eq!(condition, expected);
@ -705,11 +591,8 @@ mod tests {
// Set the faceted fields to be the channel.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order
builder.set_faceted_fields(hashmap!{
"channel".into() => "string".into(),
"timestamp".into() => "number".into(),
});
builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") });
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
@ -717,7 +600,7 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_array(
&rtxn, &index,
vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])],
vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])],
).unwrap().unwrap();
let expected = FacetCondition::from_str(
&rtxn, &index,

View File

@ -3,12 +3,12 @@ use std::ops::Bound::Unbounded;
use std::{cmp, fmt};
use anyhow::Context;
use heed::BytesDecode;
use heed::{Database, BytesDecode};
use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap;
use crate::facet::{FacetType, FacetValue};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::facet::FacetType;
use crate::heed_codec::facet::FacetValueStringCodec;
use crate::search::facet::{FacetIter, FacetRange};
use crate::{Index, FieldId, DocumentId};
@ -60,86 +60,81 @@ impl<'a> FacetDistribution<'a> {
/// There is a small amount of candidates OR we ask for facet string values so we
/// decide to iterate over the facet values of each one of them, one by one.
fn facet_values_from_documents(
fn facet_distribution_from_documents(
&self,
field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>>
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{
fn fetch_facet_values<'t, KC, K: 't>(
index: &Index,
rtxn: &'t heed::RoTxn,
db: Database<KC, Unit>,
field_id: FieldId,
candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>>
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
where
K: fmt::Display,
KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>,
K: Into<FacetValue>,
{
let mut facet_values = BTreeMap::new();
let mut key_buffer = vec![field_id];
for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) {
key_buffer.truncate(1);
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = index.field_id_docid_facet_values
let iter = db
.remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &key_buffer)?
.remap_key_type::<KC>();
for result in iter {
let ((_, _, value), ()) = result?;
*facet_values.entry(value.into()).or_insert(0) += 1;
*distribution.entry(value.to_string()).or_insert(0) += 1;
}
}
Ok(facet_values)
Ok(())
}
let index = self.index;
let rtxn = self.rtxn;
match facet_type {
FacetType::String => {
fetch_facet_values::<FieldDocIdFacetStringCodec, _>(index, rtxn, field_id, candidates)
},
FacetType::Number => {
fetch_facet_values::<FieldDocIdFacetF64Codec, _>(index, rtxn, field_id, candidates)
let db = self.index.field_id_docid_facet_f64s;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
},
FacetType::String => {
let db = self.index.field_id_docid_facet_strings;
fetch_facet_values(self.rtxn, db, field_id, candidates, distribution)
}
}
}
/// There is too much documents, we use the facet levels to move throught
/// the facet values, to find the candidates and values associated.
fn facet_values_from_facet_levels(
fn facet_numbers_distribution_from_facet_levels(
&self,
field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap,
) -> heed::Result<BTreeMap<FacetValue, u64>>
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()>
{
let iter = match facet_type {
FacetType::String => unreachable!(),
FacetType::Number => {
let iter = FacetIter::new_non_reducing(
self.rtxn, self.index, field_id, candidates.clone(),
)?;
iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))
},
};
let mut facet_values = BTreeMap::new();
for result in iter {
let (value, mut docids) = result?;
docids.intersect_with(candidates);
if !docids.is_empty() {
facet_values.insert(value, docids.len());
distribution.insert(value.to_string(), docids.len());
}
if facet_values.len() == self.max_values_by_facet {
if distribution.len() == self.max_values_by_facet {
break;
}
}
Ok(facet_values)
Ok(())
}
/// Placeholder search, a.k.a. no candidates were specified. We iterate throught the
@ -147,80 +142,73 @@ impl<'a> FacetDistribution<'a> {
fn facet_values_from_raw_facet_database(
&self,
field_id: FieldId,
facet_type: FacetType,
) -> heed::Result<BTreeMap<FacetValue, u64>>
) -> heed::Result<BTreeMap<String, u64>>
{
let db = self.index.facet_field_id_value_docids;
let level = 0;
let iter = match facet_type {
FacetType::String => {
let iter = db
.prefix_iter(self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>()
.map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids)));
Box::new(iter) as Box::<dyn Iterator<Item=_>>
},
FacetType::Number => {
let db = db.remap_key_type::<FacetLevelValueF64Codec>();
let range = FacetRange::new(
self.rtxn, db, field_id, level, Unbounded, Unbounded,
)?;
Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids))))
},
};
let mut distribution = BTreeMap::new();
let mut facet_values = BTreeMap::new();
for result in iter {
let (value, docids) = result?;
facet_values.insert(value, docids.len());
if facet_values.len() == self.max_values_by_facet {
let db = self.index.facet_id_f64_docids;
let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?;
for result in range {
let ((_, _, value, _), docids) = result?;
distribution.insert(value.to_string(), docids.len());
if distribution.len() == self.max_values_by_facet {
break;
}
}
Ok(facet_values)
let iter = self.index
.facet_id_string_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(self.rtxn, &[field_id])?
.remap_key_type::<FacetValueStringCodec>();
for result in iter {
let ((_, value), docids) = result?;
distribution.insert(value.to_string(), docids.len());
if distribution.len() == self.max_values_by_facet {
break;
}
}
fn facet_values(
&self,
field_id: FieldId,
facet_type: FacetType,
) -> heed::Result<BTreeMap<FacetValue, u64>>
{
Ok(distribution)
}
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
use FacetType::{Number, String};
if let Some(candidates) = self.candidates.as_ref() {
// Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons.
if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String {
self.facet_values_from_documents(field_id, facet_type, candidates)
let mut distribution = BTreeMap::new();
if candidates.len() <= CANDIDATES_THRESHOLD {
self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
} else {
self.facet_values_from_facet_levels(field_id, facet_type, candidates)
self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?;
}
Ok(distribution)
} else {
self.facet_values_from_raw_facet_database(field_id, facet_type)
self.facet_values_from_raw_facet_database(field_id)
}
}
pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<FacetValue, u64>>> {
pub fn execute(&self) -> anyhow::Result<BTreeMap<String, BTreeMap<String, u64>>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
let fields_ids: Vec<_> = match &self.facets {
Some(names) => names
.iter()
.filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t)))
.collect(),
None => faceted_fields.into_iter().collect(),
};
let mut facets_values = BTreeMap::new();
for (name, ftype) in fields_ids {
let mut distribution = BTreeMap::new();
for name in faceted_fields {
let fid = fields_ids_map.id(&name).with_context(|| {
format!("missing field name {:?} from the fields id map", name)
})?;
let values = self.facet_values(fid, ftype)?;
facets_values.insert(name, values);
let values = self.facet_values(fid)?;
distribution.insert(name, values);
}
Ok(facets_values)
Ok(distribution)
}
}

View File

@ -9,7 +9,7 @@ use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{Index, FieldId};
pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator};
pub use self::facet_condition::{FacetCondition, Operator};
pub use self::facet_distribution::FacetDistribution;
mod facet_condition;
@ -140,7 +140,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))];
@ -157,7 +157,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Right(highest_iter))];
@ -175,7 +175,7 @@ impl<'t> FacetIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetIter<'t>>
{
let db = index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>();
let db = index.facet_id_f64_docids.remap_key_type::<FacetLevelValueF64Codec>();
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
let level_iters = vec![(documents_ids, Left(highest_iter))];

View File

@ -16,9 +16,7 @@ use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct};
use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{Index, DocumentId};
pub use self::facet::{
FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator,
};
pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator};
pub use self::query_tree::MatchingWords;
use self::query_tree::QueryTreeBuilder;
@ -143,19 +141,16 @@ impl<'a> Search<'a> {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
let id = field_ids_map.id(name).expect("distinct not present in field map");
let faceted_fields = self.index.faceted_fields(self.rtxn)?;
match faceted_fields.get(name) {
Some(facet_type) => {
let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type);
if faceted_fields.contains(name) {
let distinct = FacetDistinct::new(id, self.index, self.rtxn);
self.perform_sort(distinct, matching_words, criteria)
}
None => {
} else {
let distinct = MapDistinct::new(id, self.index, self.rtxn);
self.perform_sort(distinct, matching_words, criteria)
}
}
}
}
}
fn perform_sort(
&self,

View File

@ -30,8 +30,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents,
} = self.index;
@ -47,8 +49,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
// We clean all the faceted documents ids.
for (field_id, _) in faceted_fields {
self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?;
let empty = RoaringBitmap::default();
for field_id in faceted_fields {
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?;
}
// Clear the other databases.
@ -59,8 +63,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
word_level_position_docids.clear(self.wtxn)?;
word_prefix_level_position_docids.clear(self.wtxn)?;
facet_field_id_value_docids.clear(self.wtxn)?;
field_id_docid_facet_values.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?;
facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
documents.clear(self.wtxn)?;
Ok(number_of_documents)
@ -112,8 +118,10 @@ mod tests {
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap());
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap());
assert!(index.documents.is_empty(&rtxn).unwrap());
}
}

View File

@ -4,13 +4,12 @@ use std::collections::hash_map::Entry;
use anyhow::anyhow;
use chrono::Utc;
use fst::IntoStreamer;
use heed::types::ByteSlice;
use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap;
use serde_json::Value;
use crate::facet::FacetType;
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds};
use super::ClearDocuments;
pub struct DeleteDocuments<'t, 'u, 'i> {
@ -90,8 +89,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
documents,
} = self.index;
@ -285,52 +286,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// Remove the documents ids from the faceted documents ids.
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
for (field_id, facet_type) in faceted_fields {
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
// We delete the entries that are part of the documents ids.
let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?;
match facet_type {
FacetType::String => {
let mut iter = iter.remap_key_type::<FieldDocIdFacetStringCodec>();
while let Some(result) = iter.next() {
let ((_fid, docid, _value), ()) = result?;
if self.documents_ids.contains(docid) {
iter.del_current()?;
}
}
},
FacetType::Number => {
let mut iter = iter.remap_key_type::<FieldDocIdFacetF64Codec>();
while let Some(result) = iter.next() {
let ((_fid, docid, _value), ()) = result?;
if self.documents_ids.contains(docid) {
iter.del_current()?;
}
}
},
}
}
// We delete the documents ids that are under the facet field id values.
let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
drop(iter);
// We delete the documents ids that are under the word level position docids.
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
@ -361,10 +316,100 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_field_id_value_docids(
self.wtxn,
facet_id_f64_docids,
&self.documents_ids,
)?;
remove_docids_from_facet_field_id_value_docids(
self.wtxn,
facet_id_string_docids,
&self.documents_ids,
)?;
// Remove the documents ids from the faceted documents ids.
for field_id in self.index.faceted_fields_ids(self.wtxn)? {
// Remove docids from the number faceted documents ids
let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value(
self.wtxn,
field_id_docid_facet_f64s,
field_id,
&self.documents_ids,
|(_fid, docid, _value)| docid,
)?;
// Remove docids from the string faceted documents ids
let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids);
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value(
self.wtxn,
field_id_docid_facet_strings,
field_id,
&self.documents_ids,
|(_fid, docid, _value)| docid,
)?;
}
Ok(self.documents_ids.len())
}
}
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>(
wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, Unit>,
field_id: FieldId,
to_remove: &RoaringBitmap,
convert: F,
) -> heed::Result<()>
where
C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>,
F: Fn(K) -> DocumentId,
{
let mut iter = db.remap_key_type::<ByteSlice>()
.prefix_iter_mut(wtxn, &[field_id])?
.remap_key_type::<C>();
while let Some(result) = iter.next() {
let (key, ()) = result?;
if to_remove.contains(convert(key)) {
iter.del_current()?;
}
}
Ok(())
}
fn remove_docids_from_facet_field_id_value_docids<'a, C>(
wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
) -> heed::Result<()>
where
C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
{
let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(to_remove);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use heed::EnvOpenOptions;

View File

@ -9,7 +9,6 @@ use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::Index;
@ -62,33 +61,32 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
for (field_id, facet_type) in faceted_fields {
let (content, documents_ids) = match facet_type {
FacetType::String => {
let documents_ids = compute_faceted_documents_ids(
for field_id in faceted_fields {
// Compute and store the faceted strings documents ids.
let string_documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_field_id_value_docids,
self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(),
field_id,
)?;
(None, documents_ids)
},
FacetType::Number => {
// Clear the facet number levels.
clear_field_number_levels(
self.wtxn,
self.index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(),
self.index.facet_id_f64_docids,
field_id,
)?;
let documents_ids = compute_faceted_documents_ids(
// Compute and store the faceted numbers documents ids.
let number_documents_ids = compute_faceted_documents_ids(
self.wtxn,
self.index.facet_field_id_value_docids,
self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
field_id,
)?;
let content = compute_facet_number_levels(
self.wtxn,
self.index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(),
self.index.facet_id_f64_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
@ -97,23 +95,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
field_id,
)?;
(Some(content), documents_ids)
},
};
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?;
if let Some(content) = content {
write_into_lmdb_database(
self.wtxn,
*self.index.facet_field_id_value_docids.as_polymorph(),
*self.index.facet_id_f64_docids.as_polymorph(),
content,
|_, _| anyhow::bail!("invalid facet level merging"),
|_, _| anyhow::bail!("invalid facet number level merging"),
WriteMethod::GetMergePut,
)?;
}
self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?;
}
Ok(())
}
}
@ -205,10 +198,12 @@ fn compute_faceted_documents_ids(
) -> anyhow::Result<RoaringBitmap>
{
let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? {
let (_key, docids) = result?;
documents_ids.union_with(&docids);
documents_ids |= docids;
}
Ok(documents_ids)
}

View File

@ -412,7 +412,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
Main,
WordDocids,
WordLevel0PositionDocids,
FacetLevel0ValuesDocids,
FacetLevel0NumbersDocids,
}
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
@ -450,8 +450,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
.enumerate()
.map(|(i, documents)| {
let store = Store::new(
primary_key.clone(),
fields_ids_map.clone(),
searchable_fields.clone(),
faceted_fields.clone(),
linked_hash_map_size,
@ -478,8 +476,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len());
let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len());
let mut documents_readers = Vec::with_capacity(readers.len());
readers.into_iter().for_each(|readers| {
let Readers {
@ -488,17 +488,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions,
words_pairs_proximities_docids,
word_level_position_docids,
facet_field_value_docids,
field_id_docid_facet_values,
documents
facet_field_numbers_docids,
facet_field_strings_docids,
field_id_docid_facet_numbers,
field_id_docid_facet_strings,
documents,
} = readers;
main_readers.push(main);
word_docids_readers.push(word_docids);
docid_word_positions_readers.push(docid_word_positions);
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
word_level_position_docids_readers.push(word_level_position_docids);
facet_field_value_docids_readers.push(facet_field_value_docids);
field_id_docid_facet_values_readers.push(field_id_docid_facet_values);
facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
facet_field_strings_docids_readers.push(facet_field_strings_docids);
field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings);
documents_readers.push(documents);
});
@ -523,8 +527,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
(DatabaseType::Main, main_readers, main_merge as MergeFn),
(DatabaseType::WordDocids, word_docids_readers, word_docids_merge),
(
DatabaseType::FacetLevel0ValuesDocids,
facet_field_value_docids_readers,
DatabaseType::FacetLevel0NumbersDocids,
facet_field_numbers_docids_readers,
facet_field_value_docids_merge,
),
(
@ -547,7 +551,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions_readers,
documents_readers,
words_pairs_proximities_docids_readers,
field_id_docid_facet_values_readers,
facet_field_strings_docids_readers,
field_id_docid_facet_numbers_readers,
field_id_docid_facet_strings_readers,
)) as anyhow::Result<_>
})?;
@ -556,7 +562,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
docid_word_positions_readers,
documents_readers,
words_pairs_proximities_docids_readers,
field_id_docid_facet_values_readers,
facet_field_strings_docids_readers,
field_id_docid_facet_numbers_readers,
field_id_docid_facet_strings_readers,
) = readers;
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
@ -587,7 +595,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
let mut database_count = 0;
let total_databases = 8;
let total_databases = 10;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: 0,
@ -624,11 +632,41 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
total_databases,
});
debug!("Writing the field id docid facet values into LMDB on disk...");
debug!("Writing the facet id string docids into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
*self.index.field_id_docid_facet_values.as_polymorph(),
field_id_docid_facet_values_readers,
*self.index.facet_id_string_docids.as_polymorph(),
facet_field_strings_docids_readers,
facet_field_value_docids_merge,
write_method,
)?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases,
});
debug!("Writing the field id docid facet numbers into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
*self.index.field_id_docid_facet_f64s.as_polymorph(),
field_id_docid_facet_numbers_readers,
field_id_docid_facet_values_merge,
write_method,
)?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases,
});
debug!("Writing the field id docid facet strings into LMDB on disk...");
merge_into_lmdb_database(
self.wtxn,
*self.index.field_id_docid_facet_strings.as_polymorph(),
field_id_docid_facet_strings_readers,
field_id_docid_facet_values_merge,
write_method,
)?;
@ -678,9 +716,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method,
)?;
},
DatabaseType::FacetLevel0ValuesDocids => {
debug!("Writing the facet level 0 values docids into LMDB on disk...");
let db = *self.index.facet_field_id_value_docids.as_polymorph();
DatabaseType::FacetLevel0NumbersDocids => {
debug!("Writing the facet numbers docids into LMDB on disk...");
let db = *self.index.facet_id_f64_docids.as_polymorph();
write_into_lmdb_database(
self.wtxn,
db,

View File

@ -6,25 +6,24 @@ use std::iter::FromIterator;
use std::time::Instant;
use std::{cmp, iter};
use anyhow::{bail, Context};
use anyhow::Context;
use bstr::ByteSlice as _;
use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use heed::BytesEncode;
use linked_hash_map::LinkedHashMap;
use log::{debug, info, warn};
use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value;
use tempfile::tempfile;
use crate::facet::{FacetType, FacetValue};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap};
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{
@ -45,23 +44,24 @@ pub struct Readers {
pub docid_word_positions: Reader<FileFuse>,
pub words_pairs_proximities_docids: Reader<FileFuse>,
pub word_level_position_docids: Reader<FileFuse>,
pub facet_field_value_docids: Reader<FileFuse>,
pub field_id_docid_facet_values: Reader<FileFuse>,
pub facet_field_numbers_docids: Reader<FileFuse>,
pub facet_field_strings_docids: Reader<FileFuse>,
pub field_id_docid_facet_numbers: Reader<FileFuse>,
pub field_id_docid_facet_strings: Reader<FileFuse>,
pub documents: Reader<FileFuse>,
}
pub struct Store<'s, A> {
// Indexing parameters
primary_key: String,
fields_ids_map: FieldsIdsMap,
searchable_fields: HashSet<FieldId>,
faceted_fields: HashMap<FieldId, FacetType>,
faceted_fields: HashSet<FieldId>,
// Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize,
facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
facet_field_value_docids_limit: usize,
// MTBL parameters
chunk_compression_type: CompressionType,
@ -72,8 +72,10 @@ pub struct Store<'s, A> {
word_docids_sorter: Sorter<MergeFn>,
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
word_level_position_docids_sorter: Sorter<MergeFn>,
facet_field_value_docids_sorter: Sorter<MergeFn>,
field_id_docid_facet_values_sorter: Sorter<MergeFn>,
facet_field_numbers_docids_sorter: Sorter<MergeFn>,
facet_field_strings_docids_sorter: Sorter<MergeFn>,
field_id_docid_facet_numbers_sorter: Sorter<MergeFn>,
field_id_docid_facet_strings_sorter: Sorter<MergeFn>,
// MTBL writers
docid_word_positions_writer: Writer<File>,
documents_writer: Writer<File>,
@ -83,10 +85,8 @@ pub struct Store<'s, A> {
impl<'s, A: AsRef<[u8]>> Store<'s, A> {
pub fn new(
primary_key: String,
fields_ids_map: FieldsIdsMap,
searchable_fields: HashSet<FieldId>,
faceted_fields: HashMap<FieldId, FacetType>,
faceted_fields: HashSet<FieldId>,
linked_hash_map_size: Option<usize>,
max_nb_chunks: Option<usize>,
max_memory: Option<usize>,
@ -132,7 +132,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_nb_chunks,
max_memory,
);
let facet_field_value_docids_sorter = create_sorter(
let facet_field_numbers_docids_sorter = create_sorter(
facet_field_value_docids_merge,
chunk_compression_type,
chunk_compression_level,
@ -140,7 +140,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_nb_chunks,
max_memory,
);
let field_id_docid_facet_values_sorter = create_sorter(
let facet_field_strings_docids_sorter = create_sorter(
facet_field_value_docids_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
max_nb_chunks,
max_memory,
);
let field_id_docid_facet_numbers_sorter = create_sorter(
field_id_docid_facet_values_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
max_nb_chunks,
Some(1024 * 1024 * 1024), // 1MB
);
let field_id_docid_facet_strings_sorter = create_sorter(
field_id_docid_facet_values_merge,
chunk_compression_type,
chunk_compression_level,
@ -164,8 +180,6 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(Store {
// Indexing parameters.
primary_key,
fields_ids_map,
searchable_fields,
faceted_fields,
// Caches
@ -173,7 +187,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids_limit: linked_hash_map_size,
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
words_pairs_proximities_docids_limit: linked_hash_map_size,
facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
facet_field_value_docids_limit: linked_hash_map_size,
// MTBL parameters
chunk_compression_type,
@ -184,8 +199,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
word_docids_sorter,
words_pairs_proximities_docids_sorter,
word_level_position_docids_sorter,
facet_field_value_docids_sorter,
field_id_docid_facet_values_sorter,
facet_field_numbers_docids_sorter,
facet_field_strings_docids_sorter,
field_id_docid_facet_numbers_sorter,
field_id_docid_facet_strings_sorter,
// MTBL writers
docid_word_positions_writer,
documents_writer,
@ -215,34 +232,68 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
// Save the documents ids under the facet field id and value we have seen it.
fn insert_facet_values_docid(
fn insert_facet_number_values_docid(
&mut self,
field_id: FieldId,
field_value: FacetValue,
value: OrderedFloat<f64>,
id: DocumentId,
) -> anyhow::Result<()>
{
Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?;
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
let key = (field_id, field_value);
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_value_docids.get_refresh(&key) {
match self.facet_field_number_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
None => {
// A newly inserted element is append at the end of the linked hash map.
self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
// If the word docids just reached it's capacity we must make sure to remove
// one element, this way next time we insert we doesn't grow the capacity.
if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit {
if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit {
// Removing the front element is equivalent to removing the LRU element.
Self::write_facet_field_value_docids(
&mut self.facet_field_value_docids_sorter,
self.facet_field_value_docids.pop_front(),
Self::write_facet_field_number_docids(
&mut self.facet_field_numbers_docids_sorter,
self.facet_field_number_docids.pop_front(),
)?;
}
}
}
Ok(())
}
// Save the documents ids under the facet field id and value we have seen it.
fn insert_facet_string_values_docid(
&mut self,
field_id: FieldId,
value: String,
id: DocumentId,
) -> anyhow::Result<()>
{
let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
None => {
// A newly inserted element is append at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
// If the word docids just reached it's capacity we must make sure to remove
// one element, this way next time we insert we doesn't grow the capacity.
if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit {
// Removing the front element is equivalent to removing the LRU element.
Self::write_facet_field_string_docids(
&mut self.facet_field_strings_docids_sorter,
self.facet_field_string_docids.pop_front(),
)?;
}
}
}
Ok(())
}
@ -287,7 +338,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
&mut self,
document_id: DocumentId,
words_positions: &mut HashMap<String, SmallVec32<Position>>,
facet_values: &mut HashMap<FieldId, SmallVec8<FacetValue>>,
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
record: &[u8],
) -> anyhow::Result<()>
{
@ -306,10 +358,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
words_positions.clear();
// We store document_id associated with all the field id and values.
for (field, values) in facet_values.drain() {
// We store document_id associated with all the facet numbers fields ids and values.
for (field, values) in facet_numbers_values.drain() {
for value in values {
self.insert_facet_values_docid(field, value, document_id)?;
let value = OrderedFloat::from(value);
self.insert_facet_number_values_docid(field, value, document_id)?;
}
}
// We store document_id associated with all the facet strings fields ids and values.
for (field, values) in facet_strings_values.drain() {
for value in values {
self.insert_facet_string_values_docid(field, value, document_id)?;
}
}
@ -409,20 +469,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_value_docids<I>(
fn write_facet_field_string_docids<I>(
sorter: &mut Sorter<MergeFn>,
iter: I,
) -> anyhow::Result<()>
where I: IntoIterator<Item=((FieldId, FacetValue), RoaringBitmap)>
where I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>
{
use FacetValue::*;
for ((field_id, value), docids) in iter {
let result = match value {
String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned),
Number(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned),
};
let key = result.context("could not serialize facet key")?;
let key = FacetValueStringCodec::bytes_encode(&(field_id, &value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
.context("could not serialize docids")?;
if lmdb_key_valid_size(&key) {
@ -433,21 +489,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_field_id_docid_facet_value(
fn write_facet_field_number_docids<I>(
sorter: &mut Sorter<MergeFn>,
iter: I,
) -> anyhow::Result<()>
where I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>
{
for ((field_id, value), docids) in iter {
let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let bytes = CboRoaringBitmapCodec::bytes_encode(&docids)
.context("could not serialize docids")?;
if lmdb_key_valid_size(&key) {
sorter.insert(&key, &bytes)?;
}
}
Ok(())
}
fn write_field_id_docid_facet_number_value(
sorter: &mut Sorter<MergeFn>,
field_id: FieldId,
document_id: DocumentId,
value: &FacetValue,
value: OrderedFloat<f64>,
) -> anyhow::Result<()>
{
use FacetValue::*;
let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let result = match value {
String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned),
Number(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned),
};
if lmdb_key_valid_size(&key) {
sorter.insert(&key, &[])?;
}
Ok(())
}
fn write_field_id_docid_facet_string_value(
sorter: &mut Sorter<MergeFn>,
field_id: FieldId,
document_id: DocumentId,
value: &str,
) -> anyhow::Result<()>
{
let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value))
.map(Cow::into_owned)
.context("could not serialize facet key")?;
let key = result.context("could not serialize facet key")?;
if lmdb_key_valid_size(&key) {
sorter.insert(&key, &[])?;
}
@ -493,7 +583,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut before = Instant::now();
let mut words_positions = HashMap::new();
let mut facet_values = HashMap::new();
let mut facet_numbers_values = HashMap::new();
let mut facet_strings_values = HashMap::new();
let mut count: usize = 0;
while let Some((key, value)) = documents.next()? {
@ -513,32 +604,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
for (attr, content) in document.iter() {
if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) {
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
let value = serde_json::from_slice(content)?;
if let Some(ftype) = self.faceted_fields.get(&attr) {
let mut values = match parse_facet_value(*ftype, &value) {
Ok(values) => values,
Err(e) => {
// We extract the name of the attribute and the document id
// to help users debug a facet type conversion.
let attr_name = self.fields_ids_map.name(attr).unwrap();
let document_id: Value = self.fields_ids_map.id(&self.primary_key)
.and_then(|fid| document.get(fid))
.map(serde_json::from_slice)
.unwrap()?;
let context = format!(
"while extracting facet from the {:?} attribute in the {} document",
attr_name, document_id,
);
warn!("{}", e.context(context));
SmallVec8::default()
},
};
facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..));
}
let (facet_numbers, facet_strings) = extract_facet_values(&value);
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) {
@ -558,7 +629,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
// We write the document in the documents store.
self.write_document(document_id, &mut words_positions, &mut facet_values, value)?;
self.write_document(
document_id,
&mut words_positions,
&mut facet_numbers_values,
&mut facet_strings_values,
value,
)?;
}
// Compute the document id of the next document.
@ -585,9 +662,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
&mut self.words_pairs_proximities_docids_sorter,
self.words_pairs_proximities_docids,
)?;
Self::write_facet_field_value_docids(
&mut self.facet_field_value_docids_sorter,
self.facet_field_value_docids,
Self::write_facet_field_number_docids(
&mut self.facet_field_numbers_docids_sorter,
self.facet_field_number_docids,
)?;
Self::write_facet_field_string_docids(
&mut self.facet_field_strings_docids_sorter,
self.facet_field_string_docids,
)?;
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
@ -613,18 +695,26 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?;
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?;
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?;
let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?;
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
@ -634,8 +724,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
docid_word_positions,
words_pairs_proximities_docids,
word_level_position_docids,
facet_field_value_docids,
field_id_docid_facet_values,
facet_field_numbers_docids,
facet_field_strings_docids,
field_id_docid_facet_numbers,
field_id_docid_facet_strings,
documents,
})
}
@ -710,71 +802,35 @@ fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<
.filter(|(_, t)| t.is_word())
}
fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
use FacetValue::*;
fn inner_parse_facet_value(
ftype: FacetType,
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
fn inner_extract_facet_values(
value: &Value,
can_recurse: bool,
output: &mut SmallVec8<FacetValue>,
) -> anyhow::Result<()>
{
output_numbers: &mut Vec<f64>,
output_strings: &mut Vec<String>,
) {
match value {
Value::Null => Ok(()),
Value::Bool(b) => match ftype {
FacetType::String => {
output.push(String(b.to_string()));
Ok(())
},
FacetType::Number => {
output.push(Number(OrderedFloat(if *b { 1.0 } else { 0.0 })));
Ok(())
},
},
Value::Number(number) => match ftype {
FacetType::String => {
output.push(String(number.to_string()));
Ok(())
},
FacetType::Number => match number.as_f64() {
Some(float) => {
output.push(Number(OrderedFloat(float)));
Ok(())
},
None => bail!("invalid facet type, expecting {} found number", ftype),
},
Value::Null => (),
Value::Bool(b) => output_strings.push(b.to_string()),
Value::Number(number) => if let Some(float) = number.as_f64() {
output_numbers.push(float);
},
Value::String(string) => {
// TODO must be normalized and not only lowercased.
let string = string.trim().to_lowercase();
match ftype {
FacetType::String => {
output.push(String(string));
Ok(())
},
FacetType::Number => match string.parse() {
Ok(float) => {
output.push(Number(OrderedFloat(float)));
Ok(())
},
Err(_err) => bail!("invalid facet type, expecting {} found string", ftype),
},
}
output_strings.push(string);
},
Value::Array(values) => if can_recurse {
values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect()
} else {
bail!(
"invalid facet type, expecting {} found array (recursive arrays are not supported)",
ftype,
);
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
}
},
Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype),
Value::Object(_) => (),
}
}
let mut facet_values = SmallVec8::new();
inner_parse_facet_value(ftype, value, true, &mut facet_values)?;
Ok(facet_values)
let mut facet_number_values = Vec::new();
let mut facet_string_values = Vec::new();
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
(facet_number_values, facet_string_values)
}

View File

@ -1,5 +1,4 @@
use std::collections::{BTreeSet, HashMap};
use std::str::FromStr;
use std::collections::{BTreeSet, HashMap, HashSet};
use anyhow::Context;
use chrono::Utc;
@ -11,7 +10,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::{FieldsIdsMap, Index};
use crate::criterion::Criterion;
use crate::facet::FacetType;
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::update::index_documents::{IndexDocumentsMethod, Transform};
@ -68,7 +66,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
searchable_fields: Setting<Vec<String>>,
displayed_fields: Setting<Vec<String>>,
faceted_fields: Setting<HashMap<String, String>>,
faceted_fields: Setting<HashSet<String>>,
criteria: Setting<Vec<String>>,
stop_words: Setting<BTreeSet<String>>,
distinct_attribute: Setting<String>,
@ -123,7 +121,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.faceted_fields = Setting::Reset;
}
pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) {
pub fn set_faceted_fields(&mut self, names_facet_types: HashSet<String>) {
self.faceted_fields = Setting::Set(names_facet_types);
}
@ -387,11 +385,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
match self.faceted_fields {
Setting::Set(ref fields) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut new_facets = HashMap::new();
for (name, ty) in fields {
let mut new_facets = HashSet::new();
for name in fields {
fields_ids_map.insert(name).context("field id limit exceeded")?;
let ty = FacetType::from_str(&ty)?;
new_facets.insert(name.clone(), ty);
new_facets.insert(name.clone());
}
self.index.put_faceted_fields(self.wtxn, &new_facets)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
@ -445,9 +442,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
#[cfg(test)]
mod tests {
use heed::EnvOpenOptions;
use maplit::{btreeset, hashmap};
use heed::types::ByteSlice;
use maplit::{btreeset, hashmap, hashset};
use big_s::S;
use crate::facet::FacetType;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*;
@ -622,37 +620,53 @@ mod tests {
// Set the faceted fields to be the age.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_faceted_fields(hashmap!{ "age".into() => "number".into() });
builder.set_faceted_fields(hashset!{ S("age") });
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = &br#"[
{ "name": "kevin", "age": 23 },
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that the displayed fields are correctly set.
let rtxn = index.read_txn().unwrap();
let fields_ids = index.faceted_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Number });
assert_eq!(fields_ids, hashset!{ S("age") });
// Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
// TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
assert_eq!(count, 3);
drop(rtxn);
// Index a little more documents with new and current facets values.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..];
let content = &br#"[
{ "name": "kevin2", "age": 23 },
{ "name": "kevina2", "age": 21 },
{ "name": "benoit", "age": 35 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.update_format(UpdateFormat::Csv);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
// TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
assert_eq!(count, 4);
}
@ -819,10 +833,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_displayed_fields(vec!["hello".to_string()]);
builder.set_faceted_fields(hashmap!{
"age".into() => "number".into(),
"toto".into() => "number".into(),
});
builder.set_faceted_fields(hashset!{ S("age"), S("toto") });
builder.set_criteria(vec!["asc(toto)".to_string()]);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();