Implementing an IS EMPTY filter

This commit is contained in:
Clément Renault 2023-03-14 18:08:12 +01:00
parent fa2ea4a379
commit ea016d97af
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
12 changed files with 156 additions and 37 deletions

View File

@ -21,6 +21,7 @@ pub enum Condition<'a> {
Equal(Token<'a>), Equal(Token<'a>),
NotEqual(Token<'a>), NotEqual(Token<'a>),
Null, Null,
Empty,
Exists, Exists,
LowerThan(Token<'a>), LowerThan(Token<'a>),
LowerThanOrEqual(Token<'a>), LowerThanOrEqual(Token<'a>),
@ -61,6 +62,22 @@ pub fn parse_is_not_null(input: Span) -> IResult<FilterCondition> {
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null })))) Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null }))))
} }
/// empty = value "IS" WS+ "EMPTY"
pub fn parse_is_empty(input: Span) -> IResult<FilterCondition> {
let (input, key) = parse_value(input)?;
let (input, _) = tuple((tag("IS"), multispace1, tag("EMPTY")))(input)?;
Ok((input, FilterCondition::Condition { fid: key, op: Empty }))
}
/// empty = value "IS" WS+ "NOT" WS+ "EMPTY"
pub fn parse_is_not_empty(input: Span) -> IResult<FilterCondition> {
let (input, key) = parse_value(input)?;
let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("EMPTY")))(input)?;
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Empty }))))
}
/// exist = value "EXISTS" /// exist = value "EXISTS"
pub fn parse_exists(input: Span) -> IResult<FilterCondition> { pub fn parse_exists(input: Span) -> IResult<FilterCondition> {
let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?;

View File

@ -143,11 +143,9 @@ impl<'a> Display for Error<'a> {
ErrorKind::MissingClosingDelimiter(c) => { ErrorKind::MissingClosingDelimiter(c) => {
writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)?
} }
ErrorKind::InvalidPrimary if input.trim().is_empty() => {
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")?
}
ErrorKind::InvalidPrimary => { ErrorKind::InvalidPrimary => {
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)? let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
} }
ErrorKind::ExpectedEof => { ErrorKind::ExpectedEof => {
writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?

View File

@ -47,7 +47,10 @@ mod value;
use std::fmt::Debug; use std::fmt::Debug;
pub use condition::{parse_condition, parse_to, Condition}; pub use condition::{parse_condition, parse_to, Condition};
use condition::{parse_exists, parse_is_not_null, parse_is_null, parse_not_exists}; use condition::{
parse_exists, parse_is_empty, parse_is_not_empty, parse_is_not_null, parse_is_null,
parse_not_exists,
};
use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt};
pub use error::{Error, ErrorKind}; pub use error::{Error, ErrorKind};
use nom::branch::alt; use nom::branch::alt;
@ -416,6 +419,8 @@ fn parse_primary(input: Span, depth: usize) -> IResult<FilterCondition> {
parse_condition, parse_condition,
parse_is_null, parse_is_null,
parse_is_not_null, parse_is_not_null,
parse_is_empty,
parse_is_not_empty,
parse_exists, parse_exists,
parse_not_exists, parse_not_exists,
parse_to, parse_to,
@ -509,6 +514,13 @@ pub mod tests {
insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL"); insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL");
insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)"); insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)");
// Test EMPTY + NOT EMPTY
insta::assert_display_snapshot!(p("subscribers IS EMPTY"), @"{subscribers} IS EMPTY");
insta::assert_display_snapshot!(p("NOT subscribers IS EMPTY"), @"NOT ({subscribers} IS EMPTY)");
insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)");
insta::assert_display_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY");
insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)");
// Test EXISTS + NOT EXITS // Test EXISTS + NOT EXITS
insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS");
insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)");
@ -587,7 +599,7 @@ pub mod tests {
"###); "###);
insta::assert_display_snapshot!(p("'OR'"), @r###" insta::assert_display_snapshot!(p("'OR'"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`.
1:5 'OR' 1:5 'OR'
"###); "###);
@ -597,12 +609,12 @@ pub mod tests {
"###); "###);
insta::assert_display_snapshot!(p("channel Ponce"), @r###" insta::assert_display_snapshot!(p("channel Ponce"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`.
1:14 channel Ponce 1:14 channel Ponce
"###); "###);
insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.
19:19 channel = Ponce OR 19:19 channel = Ponce OR
"###); "###);
@ -667,12 +679,12 @@ pub mod tests {
"###); "###);
insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`.
1:17 colour NOT EXIST 1:17 colour NOT EXIST
"###); "###);
insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###"
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`.
1:23 subscribers 100 TO1000 1:23 subscribers 100 TO1000
"###); "###);
@ -812,6 +824,7 @@ impl<'a> std::fmt::Display for Condition<'a> {
Condition::Equal(token) => write!(f, "= {token}"), Condition::Equal(token) => write!(f, "= {token}"),
Condition::NotEqual(token) => write!(f, "!= {token}"), Condition::NotEqual(token) => write!(f, "!= {token}"),
Condition::Null => write!(f, "IS NULL"), Condition::Null => write!(f, "IS NULL"),
Condition::Empty => write!(f, "IS EMPTY"),
Condition::Exists => write!(f, "EXISTS"), Condition::Exists => write!(f, "EXISTS"),
Condition::LowerThan(token) => write!(f, "< {token}"), Condition::LowerThan(token) => write!(f, "< {token}"),
Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"),

View File

@ -188,6 +188,7 @@ fn is_keyword(s: &str) -> bool {
| "EXISTS" | "EXISTS"
| "IS" | "IS"
| "NULL" | "NULL"
| "EMPTY"
| "_geoRadius" | "_geoRadius"
| "_geoBoundingBox" | "_geoBoundingBox"
) )

View File

@ -547,7 +547,7 @@ async fn filter_invalid_syntax_object() {
index.wait_task(1).await; index.wait_task(1).await;
let expected_response = json!({ let expected_response = json!({
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
"code": "invalid_search_filter", "code": "invalid_search_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -572,7 +572,7 @@ async fn filter_invalid_syntax_array() {
index.wait_task(1).await; index.wait_task(1).await;
let expected_response = json!({ let expected_response = json!({
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
"code": "invalid_search_filter", "code": "invalid_search_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_search_filter"

View File

@ -81,6 +81,7 @@ pub mod db_name {
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -130,9 +131,10 @@ pub struct Index {
/// Maps the facet field id and the docids for which this field exists /// Maps the facet field id and the docids for which this field exists
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is set as null /// Maps the facet field id and the docids for which this field is set as null
pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is considered empty
pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and ranges of numbers with the docids that corresponds to them. /// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
@ -157,7 +159,7 @@ impl Index {
) -> Result<Index> { ) -> Result<Index> {
use db_name::*; use db_name::*;
options.max_dbs(20); options.max_dbs(21);
unsafe { options.flag(Flags::MdbAlwaysFreePages) }; unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?; let env = options.open(path)?;
@ -180,6 +182,7 @@ impl Index {
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?; let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings = let field_id_docid_facet_strings =
@ -207,6 +210,7 @@ impl Index {
facet_id_string_docids, facet_id_string_docids,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
documents, documents,
@ -851,6 +855,18 @@ impl Index {
} }
} }
/// Retrieve all the documents which contain this field id and that is considered empty
pub fn empty_faceted_documents_ids(
&self,
rtxn: &RoTxn,
field_id: FieldId,
) -> heed::Result<RoaringBitmap> {
match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? {
Some(docids) => Ok(docids),
None => Ok(RoaringBitmap::new()),
}
}
/// Retrieve all the documents which contain this field id /// Retrieve all the documents which contain this field id
pub fn exists_faceted_documents_ids( pub fn exists_faceted_documents_ids(
&self, &self,

View File

@ -223,6 +223,10 @@ impl<'a> Filter<'a> {
let is_null = index.null_faceted_documents_ids(rtxn, field_id)?; let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
return Ok(is_null); return Ok(is_null);
} }
Condition::Empty => {
let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
return Ok(is_empty);
}
Condition::Exists => { Condition::Exists => {
let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
return Ok(exist); return Ok(exist);

View File

@ -35,6 +35,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_string_docids, facet_id_string_docids,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
documents, documents,
@ -88,6 +89,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_f64_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?;
facet_id_exists_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?;
facet_id_is_null_docids.clear(self.wtxn)?; facet_id_is_null_docids.clear(self.wtxn)?;
facet_id_is_empty_docids.clear(self.wtxn)?;
facet_id_string_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?;

View File

@ -246,6 +246,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
script_language_docids, script_language_docids,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
facet_id_is_empty_docids,
documents, documents,
} = self.index; } = self.index;
@ -531,6 +532,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&self.to_delete_docids, &self.to_delete_docids,
)?; )?;
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_id_docids(
self.wtxn,
facet_id_is_empty_docids,
&self.to_delete_docids,
)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
Ok(DetailedDocumentDeletionResult { Ok(DetailedDocumentDeletionResult {

View File

@ -21,6 +21,7 @@ pub struct ExtractedFacetValues {
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>, pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
pub docid_fid_facet_strings_chunk: grenad::Reader<File>, pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>, pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
pub fid_facet_exists_docids_chunk: grenad::Reader<File>, pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
} }
@ -56,6 +57,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
@ -80,10 +82,14 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
key_buffer.extend_from_slice(docid_bytes); key_buffer.extend_from_slice(docid_bytes);
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
match extract_facet_values(&value) { match extract_facet_values(&value) {
FilterableValues::Null => { FilterableValues::Null => {
facet_is_null_docids.entry(field_id).or_default().insert(document); facet_is_null_docids.entry(field_id).or_default().insert(document);
} }
FilterableValues::Empty => {
facet_is_empty_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Values { numbers, strings } => { FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter // insert facet numbers in sorter
for number in numbers { for number in numbers {
@ -140,22 +146,34 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
} }
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
let mut facet_is_empty_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
}
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
Ok(ExtractedFacetValues { Ok(ExtractedFacetValues {
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
fid_facet_exists_docids_chunk: facet_exists_docids_reader, fid_facet_exists_docids_chunk: facet_exists_docids_reader,
}) })
} }
/// Represent what a document field contains. /// Represent what a document field contains.
enum FilterableValues { enum FilterableValues {
/// Corresponds to the JSON `null` value.
Null, Null,
/// Corresponds to either, an empty string `""`, an empty array `[]`, or an empty object `{}`.
Empty,
/// Represents all the numbers and strings values found in this document field. /// Represents all the numbers and strings values found in this document field.
Values { Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
numbers: Vec<f64>,
strings: Vec<(String, String)>,
},
} }
fn extract_facet_values(value: &Value) -> FilterableValues { fn extract_facet_values(value: &Value) -> FilterableValues {
@ -192,6 +210,9 @@ fn extract_facet_values(value: &Value) -> FilterableValues {
match value { match value {
Value::Null => FilterableValues::Null, Value::Null => FilterableValues::Null,
Value::String(s) if s.is_empty() => FilterableValues::Empty,
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
otherwise => { otherwise => {
let mut numbers = Vec::new(); let mut numbers = Vec::new();
let mut strings = Vec::new(); let mut strings = Vec::new();

View File

@ -55,22 +55,23 @@ pub(crate) fn data_from_obkv_documents(
.collect::<Result<()>>()?; .collect::<Result<()>>()?;
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>))))> = flattened_obkv_chunks let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
.par_bridge() flattened_obkv_chunks
.map(|flattened_obkv_chunks| { .par_bridge()
send_and_extract_flattened_documents_data( .map(|flattened_obkv_chunks| {
flattened_obkv_chunks, send_and_extract_flattened_documents_data(
indexer, flattened_obkv_chunks,
lmdb_writer_sx.clone(), indexer,
&searchable_fields, lmdb_writer_sx.clone(),
&faceted_fields, &searchable_fields,
primary_key_id, &faceted_fields,
geo_fields_ids, primary_key_id,
&stop_words, geo_fields_ids,
max_positions_per_attributes, &stop_words,
) max_positions_per_attributes,
}) )
.collect(); })
.collect();
let ( let (
docid_word_positions_chunks, docid_word_positions_chunks,
@ -78,7 +79,10 @@ pub(crate) fn data_from_obkv_documents(
docid_fid_facet_numbers_chunks, docid_fid_facet_numbers_chunks,
( (
docid_fid_facet_strings_chunks, docid_fid_facet_strings_chunks,
(facet_is_null_docids_chunks, facet_exists_docids_chunks), (
facet_is_null_docids_chunks,
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
),
), ),
), ),
) = result?; ) = result?;
@ -115,6 +119,22 @@ pub(crate) fn data_from_obkv_documents(
}); });
} }
// merge facet_is_empty_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-empty-docids");
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>( spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), docid_word_positions_chunks.clone(),
indexer, indexer,
@ -254,7 +274,10 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
( (
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
(grenad::Reader<CursorClonableMmap>, (grenad::Reader<File>, grenad::Reader<File>)), (
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
),
), ),
)> { )> {
let flattened_documents_chunk = let flattened_documents_chunk =
@ -304,6 +327,7 @@ fn send_and_extract_flattened_documents_data(
docid_fid_facet_numbers_chunk, docid_fid_facet_numbers_chunk,
docid_fid_facet_strings_chunk, docid_fid_facet_strings_chunk,
fid_facet_is_null_docids_chunk, fid_facet_is_null_docids_chunk,
fid_facet_is_empty_docids_chunk,
fid_facet_exists_docids_chunk, fid_facet_exists_docids_chunk,
} = extract_fid_docid_facet_values( } = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
@ -331,7 +355,10 @@ fn send_and_extract_flattened_documents_data(
docid_fid_facet_numbers_chunk, docid_fid_facet_numbers_chunk,
( (
docid_fid_facet_strings_chunk, docid_fid_facet_strings_chunk,
(fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk), (
fid_facet_is_null_docids_chunk,
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
),
), ),
)) ))
}, },

View File

@ -40,6 +40,7 @@ pub(crate) enum TypedChunk {
FieldIdFacetNumberDocids(grenad::Reader<File>), FieldIdFacetNumberDocids(grenad::Reader<File>),
FieldIdFacetExistsDocids(grenad::Reader<File>), FieldIdFacetExistsDocids(grenad::Reader<File>),
FieldIdFacetIsNullDocids(grenad::Reader<File>), FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>), GeoPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
} }
@ -173,6 +174,17 @@ pub(crate) fn write_typed_chunk_into_index(
)?; )?;
is_merged_database = true; is_merged_database = true;
} }
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
append_entries_into_database(
facet_id_is_empty_docids,
&index.facet_id_is_empty_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
append_entries_into_database( append_entries_into_database(
word_pair_proximity_docids_iter, word_pair_proximity_docids_iter,