diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index f57d6d54f..c80bb38f1 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::mem::take; use anyhow::Context; @@ -7,11 +6,10 @@ use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::{FieldsIdsMap, FieldId, Index}; +use crate::{FieldId, Index}; use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make @@ -119,7 +117,6 @@ impl<'t> Criterion for AscDesc<'t> { self.index, self.rtxn, self.field_id, - self.facet_type, self.ascending, candidates, )?; @@ -141,20 +138,6 @@ impl<'t> Criterion for AscDesc<'t> { } } -fn field_id_facet_type( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - field: &str, -) -> anyhow::Result<(FieldId, FacetType)> { - let id = fields_ids_map - .id(field) - .with_context(|| format!("field {:?} isn't registered", field))?; - let facet_type = faceted_fields - .get(field) - .with_context(|| format!("field {:?} isn't faceted", field))?; - Ok((id, *facet_type)) -} - /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 7411e4af9..9485087d3 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -189,23 +189,21 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::HashSet; use super::super::test::{generate_index, validate_distinct_candidates}; use super::*; - use crate::facet::FacetType; macro_rules! test_facet_distinct { - ($name:ident, $distinct:literal, $facet_type:expr) => { + ($name:ident, $distinct:literal) => { #[test] fn $name() { use std::iter::FromIterator; - let facets = - HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let facets = HashSet::from_iter(Some(($distinct.to_string()))); let (index, fid, candidates) = generate_index($distinct, facets); let txn = index.read_txn().unwrap(); - let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); let mut iter = map_distinct.distinct(candidates.clone(), excluded); let count = validate_distinct_candidates(iter.by_ref(), fid, &index); @@ -215,7 +213,7 @@ mod test { }; } - test_facet_distinct!(test_string, "txt", FacetType::String); - test_facet_distinct!(test_strings, "txts", FacetType::String); - test_facet_distinct!(test_number, "cat-int", FacetType::Number); + test_facet_distinct!(test_string, "txt"); + test_facet_distinct!(test_strings, "txts"); + test_facet_distinct!(test_number, "cat-int"); } diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 4c01d1ded..465af2c3b 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -110,7 +110,7 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::HashSet; use super::*; use super::super::test::{generate_index, validate_distinct_candidates}; @@ -119,7 +119,7 @@ mod test { ($name:ident, $distinct:literal) => { #[test] fn $name() { - let (index, fid, candidates) = generate_index($distinct, HashMap::new()); + let (index, fid, candidates) = generate_index($distinct, HashSet::new()); let txn = index.read_txn().unwrap(); let mut map_distinct = MapDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 776f0d2b3..0dd628d5b 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -28,7 +28,7 @@ pub trait Distinct<'a> { #[cfg(test)] mod test { - use std::collections::{HashMap, HashSet}; + use std::collections::HashSet; use once_cell::sync::Lazy; use rand::{seq::SliceRandom, Rng}; @@ -74,7 +74,7 @@ mod test { /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. - pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) { + pub(crate) fn generate_index(distinct: &str, facets: HashSet) -> (TempIndex, FieldId, RoaringBitmap) { let index = TempIndex::new(); let mut txn = index.write_txn().unwrap(); diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index b189f5f0f..e7917df97 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -240,7 +240,10 @@ impl FacetCondition { let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); - Ok(Operator(fid, Equal(Some(result?), svalue))) + // TODO we must normalize instead of lowercase. + let svalue = svalue.to_lowercase(); + + Ok(Operator(fid, Equal(result.ok(), svalue))) } fn greater_than( @@ -473,7 +476,8 @@ mod tests { use super::*; use crate::update::Settings; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::hashset; + use big_s::S; #[test] fn string() { @@ -485,22 +489,22 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); + builder.set_faceted_fields(hashset!{ S("channel") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::equal("Ponce")); + let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); + let expected = Operator(0, Operator::Equal(None, S("ponce"))); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); } @@ -514,20 +518,20 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() }); + builder.set_faceted_fields(hashset!{ "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = OperatorNumber(0, Between(22.0, 44.0)); + let expected = Operator(0, Between(22.0, 44.0)); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let expected = Or( - Box::new(OperatorNumber(0, LowerThan(22.0))), - Box::new(OperatorNumber(0, GreaterThan(44.0))), + Box::new(Operator(0, LowerThan(22.0))), + Box::new(Operator(0, GreaterThan(44.0))), ); assert_eq!(condition, expected); } @@ -542,11 +546,8 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "number".into(), - }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -557,10 +558,10 @@ mod tests { "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(And( - Box::new(OperatorNumber(1, Between(22.0, 44.0))), - Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), + Box::new(Operator(1, Between(22.0, 44.0))), + Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), )) ); assert_eq!(condition, expected); @@ -570,13 +571,13 @@ mod tests { "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Or( Box::new(Or( - Box::new(OperatorNumber(1, LowerThan(22.0))), - Box::new(OperatorNumber(1, GreaterThan(44.0))), + Box::new(Operator(1, LowerThan(22.0))), + Box::new(Operator(1, GreaterThan(44.0))), )), - Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), + Box::new(Operator(0, Operator::Equal(None, S("ponce")))), )), ); assert_eq!(condition, expected); @@ -592,11 +593,8 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "number".into(), - }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -604,7 +602,7 @@ mod tests { let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_array( &rtxn, &index, - vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])], + vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], ).unwrap().unwrap(); let expected = FacetCondition::from_str( &rtxn, &index, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index c163046ec..e4c1d35f8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -118,8 +118,10 @@ mod tests { assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); - assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap()); - assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap()); assert!(index.documents.is_empty(&rtxn).unwrap()); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 10c2e41e7..064f4e6fd 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -450,8 +450,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .enumerate() .map(|(i, documents)| { let store = Store::new( - primary_key.clone(), - fields_ids_map.clone(), searchable_fields.clone(), faceted_fields.clone(), linked_hash_map_size, @@ -553,7 +551,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - facet_field_numbers_docids_readers, facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, @@ -565,7 +562,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - facet_field_numbers_docids_readers, facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, @@ -599,7 +595,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 8; + let total_databases = 10; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -636,6 +632,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); + debug!("Writing the facet id string docids into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_field_strings_docids_readers, + facet_field_value_docids_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + debug!("Writing the field id docid facet numbers into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index ba8da6d16..afc199293 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -23,7 +23,7 @@ use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -53,8 +53,6 @@ pub struct Readers { pub struct Store<'s, A> { // Indexing parameters - primary_key: String, - fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashSet, // Caches @@ -87,8 +85,6 @@ pub struct Store<'s, A> { impl<'s, A: AsRef<[u8]>> Store<'s, A> { pub fn new( - primary_key: String, - fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashSet, linked_hash_map_size: Option, @@ -184,8 +180,6 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(Store { // Indexing parameters. - primary_key, - fields_ids_map, searchable_fields, faceted_fields, // Caches diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 79c447834..1571f627d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,5 +1,4 @@ use std::collections::{BTreeSet, HashMap, HashSet}; -use std::str::FromStr; use anyhow::Context; use chrono::Utc; @@ -443,9 +442,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { use heed::EnvOpenOptions; - use maplit::{btreeset, hashmap}; + use heed::types::ByteSlice; + use maplit::{btreeset, hashmap, hashset}; + use big_s::S; - use crate::facet::FacetType; use crate::update::{IndexDocuments, UpdateFormat}; use super::*; @@ -620,37 +620,53 @@ mod tests { // Set the faceted fields to be the age. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "age".into() => "number".into() }); + builder.set_faceted_fields(hashset!{ S("age") }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = &br#"[ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Number }); + assert_eq!(fields_ids, hashset!{ S("age") }); // Only count the field_id 0 and level 0 facet values. - let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + // TODO we must support typed CSVs for numbers to be understood. + let count = index.facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 3); drop(rtxn); // Index a little more documents with new and current facets values. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; + let content = &br#"[ + { "name": "kevin2", "age": 23 }, + { "name": "kevina2", "age": 21 }, + { "name": "benoit", "age": 35 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); + builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. - let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + // TODO we must support typed CSVs for numbers to be understood. + let count = index.facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 4); } @@ -817,10 +833,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_faceted_fields(hashmap!{ - "age".into() => "number".into(), - "toto".into() => "number".into(), - }); + builder.set_faceted_fields(hashset!{ S("age"), S("toto") }); builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap();