2020-10-25 21:49:04 +08:00
use std ::borrow ::Cow ;
2023-07-26 15:33:42 +08:00
use std ::collections ::{ BTreeMap , BTreeSet , HashMap , HashSet } ;
2024-05-06 20:49:45 +08:00
use std ::convert ::TryInto ;
2022-06-22 22:23:11 +08:00
use std ::fs ::File ;
2020-10-30 17:56:35 +08:00
use std ::path ::Path ;
2020-10-25 21:49:04 +08:00
2022-10-15 03:05:53 +08:00
use charabia ::{ Language , Script } ;
2020-10-21 21:55:48 +08:00
use heed ::types ::* ;
2023-11-23 01:21:19 +08:00
use heed ::{ CompactionOption , Database , RoTxn , RwTxn , Unspecified } ;
2020-10-21 21:55:48 +08:00
use roaring ::RoaringBitmap ;
2021-08-23 22:32:11 +08:00
use rstar ::RTree ;
2024-05-30 17:50:30 +08:00
use serde ::{ Deserialize , Serialize } ;
2022-02-15 18:41:55 +08:00
use time ::OffsetDateTime ;
2020-10-21 21:55:48 +08:00
2023-11-09 21:22:43 +08:00
use crate ::documents ::PrimaryKey ;
2021-07-22 23:11:17 +08:00
use crate ::error ::{ InternalError , UserError } ;
2021-06-17 00:33:33 +08:00
use crate ::fields_ids_map ::FieldsIdsMap ;
2022-09-05 23:31:26 +08:00
use crate ::heed_codec ::facet ::{
FacetGroupKeyCodec , FacetGroupValueCodec , FieldDocIdFacetF64Codec , FieldDocIdFacetStringCodec ,
2022-10-12 15:42:55 +08:00
FieldIdCodec , OrderedF64Codec ,
2022-09-05 23:31:26 +08:00
} ;
2023-07-20 23:57:07 +08:00
use crate ::heed_codec ::{
BEU16StrCodec , FstSetCodec , ScriptLanguageCodec , StrBEU16Codec , StrRefCodec ,
} ;
2024-03-12 18:01:46 +08:00
use crate ::order_by_map ::OrderByMap ;
2023-12-06 22:49:02 +08:00
use crate ::proximity ::ProximityPrecision ;
2024-05-21 23:08:45 +08:00
use crate ::vector ::parsed_vectors ::RESERVED_VECTORS_FIELD_NAME ;
2024-05-22 18:26:00 +08:00
use crate ::vector ::{ Embedding , EmbeddingConfig } ;
2021-06-17 00:33:33 +08:00
use crate ::{
2023-06-07 16:02:21 +08:00
default_criteria , CboRoaringBitmapCodec , Criterion , DocumentId , ExternalDocumentsIds ,
2024-05-15 21:02:26 +08:00
FacetDistribution , FieldDistribution , FieldId , FieldIdMapMissingEntry , FieldIdWordCountCodec ,
FieldidsWeightsMap , GeoPoint , ObkvCodec , Result , RoaringBitmapCodec , RoaringBitmapLenCodec ,
Search , U8StrStrCodec , Weight , BEU16 , BEU32 , BEU64 ,
2021-04-21 17:49:26 +08:00
} ;
2020-10-21 21:55:48 +08:00
2022-04-01 00:23:12 +08:00
/// Default value of the "one typo" minimum word length.
///
/// NOTE(review): presumably the shortest word for which a single typo is tolerated;
/// confirm against the typo-tolerance settings code.
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
/// Default value of the "two typos" minimum word length.
///
/// NOTE(review): presumably the shortest word for which two typos are tolerated;
/// confirm against the typo-tolerance settings code.
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
2022-03-21 20:03:06 +08:00
2021-06-15 17:06:42 +08:00
/// Keys under which the individual entries of the `main` database are stored.
///
/// Every constant is the string key handed to the `main` database when the
/// corresponding setting or piece of metadata is read, written or deleted.
pub mod main_key {
    // Ranking, display and faceting settings.
    pub const CRITERIA_KEY: &str = " criteria ";
    pub const DISPLAYED_FIELDS_KEY: &str = " displayed-fields ";
    pub const DISTINCT_FIELD_KEY: &str = " distinct-field-key ";
    pub const DOCUMENTS_IDS_KEY: &str = " documents-ids ";
    pub const HIDDEN_FACETED_FIELDS_KEY: &str = " hidden-faceted-fields ";
    pub const FILTERABLE_FIELDS_KEY: &str = " filterable-fields ";
    pub const SORTABLE_FIELDS_KEY: &str = " sortable-fields ";

    // Field bookkeeping.
    pub const FIELD_DISTRIBUTION_KEY: &str = " fields-distribution ";
    pub const FIELDS_IDS_MAP_KEY: &str = " fields-ids-map ";
    pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = " fieldids-weights-map ";

    // Geo search.
    pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = " geo-faceted-documents-ids ";
    pub const GEO_RTREE_KEY: &str = " geo-rtree ";

    // Primary key and searchable attributes.
    pub const PRIMARY_KEY_KEY: &str = " primary-key ";
    pub const SEARCHABLE_FIELDS_KEY: &str = " searchable-fields ";
    pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = " user-defined-searchable-fields ";

    // Tokenization related settings.
    pub const STOP_WORDS_KEY: &str = " stop-words ";
    pub const NON_SEPARATOR_TOKENS_KEY: &str = " non-separator-tokens ";
    pub const SEPARATOR_TOKENS_KEY: &str = " separator-tokens ";
    pub const DICTIONARY_KEY: &str = " dictionary ";
    pub const SYNONYMS_KEY: &str = " synonyms ";
    pub const USER_DEFINED_SYNONYMS_KEY: &str = " user-defined-synonyms ";

    // Word dictionaries.
    pub const WORDS_FST_KEY: &str = " words-fst ";
    pub const WORDS_PREFIXES_FST_KEY: &str = " words-prefixes-fst ";

    // Index creation and last update dates.
    pub const CREATED_AT_KEY: &str = " created-at ";
    pub const UPDATED_AT_KEY: &str = " updated-at ";

    // Typo tolerance.
    pub const AUTHORIZE_TYPOS: &str = " authorize-typos ";
    pub const ONE_TYPO_WORD_LEN: &str = " one-typo-word-len ";
    pub const TWO_TYPOS_WORD_LEN: &str = " two-typos-word-len ";
    pub const EXACT_WORDS: &str = " exact-words ";
    pub const EXACT_ATTRIBUTES: &str = " exact-attributes ";

    // Miscellaneous search settings.
    pub const MAX_VALUES_PER_FACET: &str = " max-values-per-facet ";
    pub const SORT_FACET_VALUES_BY: &str = " sort-facet-values-by ";
    pub const PAGINATION_MAX_TOTAL_HITS: &str = " pagination-max-total-hits ";
    pub const PROXIMITY_PRECISION: &str = " proximity-precision ";
    pub const EMBEDDING_CONFIGS: &str = " embedding_configs ";
    pub const SEARCH_CUTOFF: &str = " search_cutoff ";
}
/// Names of the named databases living inside the index environment.
///
/// `Index::new_with_creation_dates` passes these names to the environment
/// when it creates (or opens) each database.
pub mod db_name {
    pub const MAIN: &str = " main ";

    // Word level databases.
    pub const WORD_DOCIDS: &str = " word-docids ";
    pub const EXACT_WORD_DOCIDS: &str = " exact-word-docids ";
    pub const WORD_PREFIX_DOCIDS: &str = " word-prefix-docids ";
    pub const EXACT_WORD_PREFIX_DOCIDS: &str = " exact-word-prefix-docids ";
    pub const EXTERNAL_DOCUMENTS_IDS: &str = " external-documents-ids ";
    pub const DOCID_WORD_POSITIONS: &str = " docid-word-positions ";
    pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = " word-pair-proximity-docids ";
    pub const WORD_POSITION_DOCIDS: &str = " word-position-docids ";
    pub const WORD_FIELD_ID_DOCIDS: &str = " word-field-id-docids ";
    pub const WORD_PREFIX_POSITION_DOCIDS: &str = " word-prefix-position-docids ";
    pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = " word-prefix-field-id-docids ";
    pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = " field-id-word-count-docids ";

    // Facet databases.
    pub const FACET_ID_F64_DOCIDS: &str = " facet-id-f64-docids ";
    pub const FACET_ID_EXISTS_DOCIDS: &str = " facet-id-exists-docids ";
    pub const FACET_ID_IS_NULL_DOCIDS: &str = " facet-id-is-null-docids ";
    pub const FACET_ID_IS_EMPTY_DOCIDS: &str = " facet-id-is-empty-docids ";
    pub const FACET_ID_STRING_DOCIDS: &str = " facet-id-string-docids ";
    pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = " facet-id-normalized-string-strings ";
    pub const FACET_ID_STRING_FST: &str = " facet-id-string-fst ";
    pub const FIELD_ID_DOCID_FACET_F64S: &str = " field-id-docid-facet-f64s ";
    pub const FIELD_ID_DOCID_FACET_STRINGS: &str = " field-id-docid-facet-strings ";

    // Vector store databases.
    pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = " vector-embedder-category-id ";
    pub const VECTOR_ARROY: &str = " vector-arroy ";

    // Documents and language detection.
    pub const DOCUMENTS: &str = " documents ";
    pub const SCRIPT_LANGUAGE_DOCIDS: &str = " script_language_docids ";
}
2020-10-21 21:55:48 +08:00
#[ derive(Clone) ]
pub struct Index {
2020-10-30 17:56:35 +08:00
/// The LMDB environment which this index is associated with.
2022-06-13 22:39:17 +08:00
pub ( crate ) env : heed ::Env ,
2021-04-21 17:49:26 +08:00
2020-10-22 20:23:33 +08:00
/// Contains many different types (e.g. the fields ids map).
2023-11-23 01:21:19 +08:00
pub ( crate ) main : Database < Unspecified , Unspecified > ,
2021-04-21 17:49:26 +08:00
2023-10-28 18:56:46 +08:00
/// Maps the external documents ids with the internal document id.
2023-11-23 01:21:19 +08:00
pub external_documents_ids : Database < Str , BEU32 > ,
2023-10-28 18:56:46 +08:00
2020-10-21 21:55:48 +08:00
/// A word and all the documents ids containing the word.
2023-09-25 22:39:32 +08:00
pub word_docids : Database < Str , CboRoaringBitmapCodec > ,
2022-03-24 22:22:57 +08:00
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
2023-09-25 22:39:32 +08:00
pub exact_word_docids : Database < Str , CboRoaringBitmapCodec > ,
2022-03-24 22:22:57 +08:00
2021-02-03 17:30:33 +08:00
/// A prefix of word and all the documents ids containing this prefix.
2023-09-25 22:39:32 +08:00
pub word_prefix_docids : Database < Str , CboRoaringBitmapCodec > ,
2021-04-21 17:49:26 +08:00
2022-03-25 17:49:34 +08:00
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
2023-09-25 22:39:32 +08:00
pub exact_word_prefix_docids : Database < Str , CboRoaringBitmapCodec > ,
2022-03-25 17:49:34 +08:00
2020-10-21 21:55:48 +08:00
/// Maps the proximity between a pair of words with all the docids where this relation appears.
2022-09-14 20:01:53 +08:00
pub word_pair_proximity_docids : Database < U8StrStrCodec , CboRoaringBitmapCodec > ,
2021-05-27 21:27:41 +08:00
2021-10-05 17:18:42 +08:00
/// Maps the word and the position with the docids that corresponds to it.
2023-03-23 16:22:01 +08:00
pub word_position_docids : Database < StrBEU16Codec , CboRoaringBitmapCodec > ,
/// Maps the word and the field id with the docids that corresponds to it.
pub word_fid_docids : Database < StrBEU16Codec , CboRoaringBitmapCodec > ,
2021-05-27 21:27:41 +08:00
/// Maps the field id and the word count with the docids that corresponds to it.
pub field_id_word_count_docids : Database < FieldIdWordCountCodec , CboRoaringBitmapCodec > ,
2023-04-24 15:59:30 +08:00
/// Maps the word prefix and a position with all the docids where the prefix appears at the position.
2023-03-23 16:22:01 +08:00
pub word_prefix_position_docids : Database < StrBEU16Codec , CboRoaringBitmapCodec > ,
2023-04-24 15:59:30 +08:00
/// Maps the word prefix and a field id with all the docids where the prefix appears inside the field
2023-03-23 16:22:01 +08:00
pub word_prefix_fid_docids : Database < StrBEU16Codec , CboRoaringBitmapCodec > ,
2021-04-21 17:49:26 +08:00
2022-10-17 19:51:04 +08:00
/// Maps the script and language with all the docids that corresponds to it.
2022-10-12 19:21:35 +08:00
pub script_language_docids : Database < ScriptLanguageCodec , RoaringBitmapCodec > ,
2022-07-19 15:30:19 +08:00
/// Maps the facet field id and the docids for which this field exists
pub facet_id_exists_docids : Database < FieldIdCodec , CboRoaringBitmapCodec > ,
2023-03-08 23:14:00 +08:00
/// Maps the facet field id and the docids for which this field is set as null
pub facet_id_is_null_docids : Database < FieldIdCodec , CboRoaringBitmapCodec > ,
2023-03-15 01:08:12 +08:00
/// Maps the facet field id and the docids for which this field is considered empty
pub facet_id_is_empty_docids : Database < FieldIdCodec , CboRoaringBitmapCodec > ,
2022-07-19 15:30:19 +08:00
2022-08-29 22:01:54 +08:00
/// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
2022-09-05 19:01:36 +08:00
pub facet_id_f64_docids : Database < FacetGroupKeyCodec < OrderedF64Codec > , FacetGroupValueCodec > ,
2022-08-29 22:01:54 +08:00
/// Maps the facet field id and ranges of strings with the docids that corresponds to them.
2022-09-05 19:01:36 +08:00
pub facet_id_string_docids : Database < FacetGroupKeyCodec < StrRefCodec > , FacetGroupValueCodec > ,
2023-07-20 23:57:07 +08:00
/// Maps the facet field id of the normalized-for-search string facets with their original versions.
pub facet_id_normalized_string_strings : Database < BEU16StrCodec , SerdeJson < BTreeSet < String > > > ,
2023-05-02 15:34:28 +08:00
/// Maps the facet field id of the string facets with an FST containing all the facets values.
2023-11-23 01:21:19 +08:00
pub facet_id_string_fst : Database < BEU16 , FstSetCodec > ,
2021-04-21 17:49:26 +08:00
/// Maps the document id, the facet field id and the numbers.
pub field_id_docid_facet_f64s : Database < FieldDocIdFacetF64Codec , Unit > ,
/// Maps the document id, the facet field id and the strings.
2021-07-15 16:19:35 +08:00
pub field_id_docid_facet_strings : Database < FieldDocIdFacetStringCodec , Str > ,
2021-04-21 17:49:26 +08:00
2023-12-07 20:33:15 +08:00
/// Maps an embedder name to its id in the arroy store.
2023-12-08 00:03:10 +08:00
pub embedder_category_id : Database < Str , U8 > ,
2023-12-07 20:33:15 +08:00
/// Vector store based on arroy™.
2023-12-08 00:03:10 +08:00
pub vector_arroy : arroy ::Database < arroy ::distances ::Angular > ,
2023-06-14 20:20:05 +08:00
2020-10-22 20:23:33 +08:00
/// Maps the document id to the document as an obkv store.
2023-11-23 01:21:19 +08:00
pub ( crate ) documents : Database < BEU32 , ObkvCodec > ,
2020-10-21 21:55:48 +08:00
}
impl Index {
2022-10-25 20:37:56 +08:00
pub fn new_with_creation_dates < P : AsRef < Path > > (
mut options : heed ::EnvOpenOptions ,
path : P ,
created_at : OffsetDateTime ,
updated_at : OffsetDateTime ,
) -> Result < Index > {
2021-06-15 17:06:42 +08:00
use db_name ::* ;
2023-12-08 00:03:10 +08:00
options . max_dbs ( 25 ) ;
2020-10-30 17:56:35 +08:00
2024-05-16 22:10:55 +08:00
let env = unsafe { options . open ( path ) } ? ;
2023-05-15 16:15:33 +08:00
let mut wtxn = env . write_txn ( ) ? ;
2023-11-23 01:21:19 +08:00
let main = env . database_options ( ) . name ( MAIN ) . create ( & mut wtxn ) ? ;
2023-05-15 16:15:33 +08:00
let word_docids = env . create_database ( & mut wtxn , Some ( WORD_DOCIDS ) ) ? ;
2023-10-28 18:56:46 +08:00
let external_documents_ids =
env . create_database ( & mut wtxn , Some ( EXTERNAL_DOCUMENTS_IDS ) ) ? ;
2023-05-15 16:15:33 +08:00
let exact_word_docids = env . create_database ( & mut wtxn , Some ( EXACT_WORD_DOCIDS ) ) ? ;
let word_prefix_docids = env . create_database ( & mut wtxn , Some ( WORD_PREFIX_DOCIDS ) ) ? ;
let exact_word_prefix_docids =
env . create_database ( & mut wtxn , Some ( EXACT_WORD_PREFIX_DOCIDS ) ) ? ;
let word_pair_proximity_docids =
env . create_database ( & mut wtxn , Some ( WORD_PAIR_PROXIMITY_DOCIDS ) ) ? ;
let script_language_docids =
env . create_database ( & mut wtxn , Some ( SCRIPT_LANGUAGE_DOCIDS ) ) ? ;
let word_position_docids = env . create_database ( & mut wtxn , Some ( WORD_POSITION_DOCIDS ) ) ? ;
let word_fid_docids = env . create_database ( & mut wtxn , Some ( WORD_FIELD_ID_DOCIDS ) ) ? ;
let field_id_word_count_docids =
env . create_database ( & mut wtxn , Some ( FIELD_ID_WORD_COUNT_DOCIDS ) ) ? ;
let word_prefix_position_docids =
env . create_database ( & mut wtxn , Some ( WORD_PREFIX_POSITION_DOCIDS ) ) ? ;
let word_prefix_fid_docids =
env . create_database ( & mut wtxn , Some ( WORD_PREFIX_FIELD_ID_DOCIDS ) ) ? ;
let facet_id_f64_docids = env . create_database ( & mut wtxn , Some ( FACET_ID_F64_DOCIDS ) ) ? ;
let facet_id_string_docids =
env . create_database ( & mut wtxn , Some ( FACET_ID_STRING_DOCIDS ) ) ? ;
2023-07-20 23:57:07 +08:00
let facet_id_normalized_string_strings =
env . create_database ( & mut wtxn , Some ( FACET_ID_NORMALIZED_STRING_STRINGS ) ) ? ;
2023-05-02 15:34:28 +08:00
let facet_id_string_fst = env . create_database ( & mut wtxn , Some ( FACET_ID_STRING_FST ) ) ? ;
2023-05-15 16:15:33 +08:00
let facet_id_exists_docids =
env . create_database ( & mut wtxn , Some ( FACET_ID_EXISTS_DOCIDS ) ) ? ;
let facet_id_is_null_docids =
env . create_database ( & mut wtxn , Some ( FACET_ID_IS_NULL_DOCIDS ) ) ? ;
let facet_id_is_empty_docids =
env . create_database ( & mut wtxn , Some ( FACET_ID_IS_EMPTY_DOCIDS ) ) ? ;
let field_id_docid_facet_f64s =
env . create_database ( & mut wtxn , Some ( FIELD_ID_DOCID_FACET_F64S ) ) ? ;
2021-06-17 00:33:33 +08:00
let field_id_docid_facet_strings =
2023-05-15 16:15:33 +08:00
env . create_database ( & mut wtxn , Some ( FIELD_ID_DOCID_FACET_STRINGS ) ) ? ;
2023-12-07 20:33:15 +08:00
// vector stuff
let embedder_category_id =
env . create_database ( & mut wtxn , Some ( VECTOR_EMBEDDER_CATEGORY_ID ) ) ? ;
let vector_arroy = env . create_database ( & mut wtxn , Some ( VECTOR_ARROY ) ) ? ;
2023-05-15 16:15:33 +08:00
let documents = env . create_database ( & mut wtxn , Some ( DOCUMENTS ) ) ? ;
wtxn . commit ( ) ? ;
2020-10-30 17:56:35 +08:00
2022-10-25 20:37:56 +08:00
Index ::set_creation_dates ( & env , main , created_at , updated_at ) ? ;
2021-03-12 01:32:04 +08:00
2020-11-11 23:04:04 +08:00
Ok ( Index {
env ,
main ,
2023-10-28 18:56:46 +08:00
external_documents_ids ,
2020-11-11 23:04:04 +08:00
word_docids ,
2022-03-24 22:22:57 +08:00
exact_word_docids ,
2021-02-03 17:30:33 +08:00
word_prefix_docids ,
2022-03-25 17:49:34 +08:00
exact_word_prefix_docids ,
2020-11-11 23:04:04 +08:00
word_pair_proximity_docids ,
2022-10-12 19:21:35 +08:00
script_language_docids ,
2021-10-05 17:18:42 +08:00
word_position_docids ,
2023-03-23 16:22:01 +08:00
word_fid_docids ,
2021-10-05 17:18:42 +08:00
word_prefix_position_docids ,
2023-03-23 16:22:01 +08:00
word_prefix_fid_docids ,
2021-05-27 21:27:41 +08:00
field_id_word_count_docids ,
2021-04-21 17:49:26 +08:00
facet_id_f64_docids ,
facet_id_string_docids ,
2023-07-20 23:57:07 +08:00
facet_id_normalized_string_strings ,
2023-05-02 15:34:28 +08:00
facet_id_string_fst ,
2022-07-19 15:30:19 +08:00
facet_id_exists_docids ,
2023-03-08 23:14:00 +08:00
facet_id_is_null_docids ,
2023-03-15 01:08:12 +08:00
facet_id_is_empty_docids ,
2021-04-21 17:49:26 +08:00
field_id_docid_facet_f64s ,
field_id_docid_facet_strings ,
2023-12-07 20:33:15 +08:00
vector_arroy ,
embedder_category_id ,
2020-11-11 23:04:04 +08:00
documents ,
} )
2020-10-30 17:56:35 +08:00
}
2022-10-25 20:37:56 +08:00
pub fn new < P : AsRef < Path > > ( options : heed ::EnvOpenOptions , path : P ) -> Result < Index > {
let now = OffsetDateTime ::now_utc ( ) ;
2022-10-26 02:58:31 +08:00
Self ::new_with_creation_dates ( options , path , now , now )
2022-10-25 20:37:56 +08:00
}
fn set_creation_dates (
env : & heed ::Env ,
2023-11-23 01:21:19 +08:00
main : Database < Unspecified , Unspecified > ,
2022-10-25 20:37:56 +08:00
created_at : OffsetDateTime ,
updated_at : OffsetDateTime ,
) -> heed ::Result < ( ) > {
2021-04-21 17:49:26 +08:00
let mut txn = env . write_txn ( ) ? ;
// The db was just created, we update its metadata with the relevant information.
2023-11-28 21:27:30 +08:00
let main = main . remap_types ::< Str , SerdeJson < OffsetDateTime > > ( ) ;
if main . get ( & txn , main_key ::CREATED_AT_KEY ) ? . is_none ( ) {
main . put ( & mut txn , main_key ::UPDATED_AT_KEY , & updated_at ) ? ;
main . put ( & mut txn , main_key ::CREATED_AT_KEY , & created_at ) ? ;
2021-04-21 17:49:26 +08:00
txn . commit ( ) ? ;
}
Ok ( ( ) )
}
2020-10-30 17:56:35 +08:00
/// Create a write transaction to be able to write into the index.
2024-07-09 23:25:39 +08:00
pub fn write_txn ( & self ) -> heed ::Result < RwTxn < '_ > > {
2020-10-30 17:56:35 +08:00
self . env . write_txn ( )
}
/// Create a read transaction to be able to read the index.
2024-07-09 23:25:39 +08:00
pub fn read_txn ( & self ) -> heed ::Result < RoTxn < '_ > > {
2020-10-30 17:56:35 +08:00
self . env . read_txn ( )
2020-10-21 21:55:48 +08:00
}
2024-05-16 22:10:55 +08:00
/// Opens a `'static` read transaction, which lets the caller read the index without
/// keeping a reference to it. The transaction is built from a clone of the environment.
pub fn static_read_txn(&self) -> heed::Result<RoTxn<'static>> {
    let env = self.env.clone();
    env.static_read_txn()
}
2020-10-30 18:46:00 +08:00
/// Returns the canonicalized path of the directory in which the heed `Env` of this
/// `Index` lives.
pub fn path(&self) -> &Path {
    let env = &self.env;
    env.path()
}
2022-08-11 17:15:46 +08:00
/// Returns the size used by the index without the cached pages, as reported by the
/// environment's `non_free_pages_size`.
pub fn used_size(&self) -> Result<u64> {
    let size = self.env.non_free_pages_size()?;
    Ok(size)
}
/// Returns the real size used by the index, as reported by the environment's
/// `real_disk_size`.
pub fn on_disk_size(&self) -> Result<u64> {
    let size = self.env.real_disk_size()?;
    Ok(size)
}
2023-01-10 16:46:28 +08:00
/// Returns the map size the underlying environment was opened with, in bytes.
///
/// This value does not represent the current on-disk size of the index.
///
/// This value is the maximum between the map size passed during the opening of the index
/// and the on-disk size of the index at the time of opening.
2023-11-28 21:31:23 +08:00
pub fn map_size ( & self ) -> usize {
self . env . info ( ) . map_size
2023-01-10 16:46:28 +08:00
}
2023-11-28 21:32:30 +08:00
/// Copies the environment of this index to the file at `path`, forwarding `option`
/// to heed, and returns the resulting `File`.
///
/// # Errors
///
/// Any error reported by heed is converted into the crate error type.
pub fn copy_to_file<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
    let file = self.env.copy_to_file(path, option)?;
    Ok(file)
}
2020-10-30 18:46:14 +08:00
/// Consumes this handle and returns an `EnvClosingEvent` on which any number of
/// threads can wait for the environment to be closed.
///
/// The closing is only triggered once every copy of the `Index` has been dropped:
/// the last reference to go away is the one that eventually closes the environment.
pub fn prepare_for_closing(self) -> heed::EnvClosingEvent {
    let env = self.env;
    env.prepare_for_closing()
}
2020-11-02 18:48:33 +08:00
/* documents ids */
2020-10-22 21:33:09 +08:00
/// Writes the documents ids that corresponds to the user-ids-documents-ids FST.
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_documents_ids (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
docids : & RoaringBitmap ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , RoaringBitmapCodec > ( ) . put (
wtxn ,
main_key ::DOCUMENTS_IDS_KEY ,
docids ,
)
2020-10-21 21:55:48 +08:00
}
2020-10-22 21:33:09 +08:00
/// Returns the internal documents ids.
2024-07-09 23:25:39 +08:00
pub fn documents_ids ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < RoaringBitmap > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , RoaringBitmapCodec > ( )
. get ( rtxn , main_key ::DOCUMENTS_IDS_KEY ) ?
2021-06-17 00:33:33 +08:00
. unwrap_or_default ( ) )
2020-10-21 21:55:48 +08:00
}
2021-03-09 17:24:27 +08:00
/// Returns the number of documents indexed in the database.
2024-07-09 23:25:39 +08:00
pub fn number_of_documents ( & self , rtxn : & RoTxn < '_ > ) -> Result < u64 > {
2023-11-23 01:21:19 +08:00
let count = self
. main
. remap_types ::< Str , RoaringBitmapLenCodec > ( )
. get ( rtxn , main_key ::DOCUMENTS_IDS_KEY ) ? ;
2021-03-09 17:24:27 +08:00
Ok ( count . unwrap_or_default ( ) )
}
2020-11-02 18:48:33 +08:00
/* primary key */
2020-10-31 18:28:48 +08:00
/// Writes the documents primary key, this is the field name that is used to store the id.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn put_primary_key (
& self ,
wtxn : & mut RwTxn < '_ > ,
primary_key : & str ,
) -> heed ::Result < ( ) > {
2022-02-15 18:41:55 +08:00
self . set_updated_at ( wtxn , & OffsetDateTime ::now_utc ( ) ) ? ;
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , Str > ( ) . put ( wtxn , main_key ::PRIMARY_KEY_KEY , primary_key )
2020-10-31 18:28:48 +08:00
}
2020-11-02 00:52:04 +08:00
/// Deletes the primary key of the documents, this can be done to reset indexes settings.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_primary_key ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::PRIMARY_KEY_KEY )
2020-10-31 18:28:48 +08:00
}
/// Returns the documents primary key, `None` if it hasn't been defined.
2024-07-09 23:25:39 +08:00
pub fn primary_key < ' t > ( & self , rtxn : & ' t RoTxn < '_ > ) -> heed ::Result < Option < & ' t str > > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , Str > ( ) . get ( rtxn , main_key ::PRIMARY_KEY_KEY )
2020-10-31 18:28:48 +08:00
}
2020-11-22 18:54:04 +08:00
/* external documents ids */
2020-11-02 18:48:33 +08:00
2020-11-23 00:28:41 +08:00
/// Returns the external documents ids map which associate the external ids
2020-10-22 21:33:09 +08:00
/// with the internal ids (i.e. `u32`).
2023-10-28 18:56:46 +08:00
pub fn external_documents_ids ( & self ) -> ExternalDocumentsIds {
ExternalDocumentsIds ::new ( self . external_documents_ids )
2020-10-21 21:55:48 +08:00
}
2020-11-02 18:48:33 +08:00
/* fields ids map */
2020-10-22 21:33:09 +08:00
/// Writes the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_fields_ids_map (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
map : & FieldsIdsMap ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < FieldsIdsMap > > ( ) . put (
wtxn ,
main_key ::FIELDS_IDS_MAP_KEY ,
map ,
)
2020-10-22 21:33:09 +08:00
}
/// Returns the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
2024-07-09 23:25:39 +08:00
pub fn fields_ids_map ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < FieldsIdsMap > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < FieldsIdsMap > > ( )
. get ( rtxn , main_key ::FIELDS_IDS_MAP_KEY ) ?
2021-06-17 00:33:33 +08:00
. unwrap_or_default ( ) )
2020-10-22 21:33:09 +08:00
}
2024-05-06 20:49:45 +08:00
/* fieldids weights map */
// This maps the fields ids to their weights.
// Their weights is defined by the ordering of the searchable attributes.
/// Writes the fieldids weights map which associates the field ids to their weights
pub ( crate ) fn put_fieldids_weights_map (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2024-05-06 20:49:45 +08:00
map : & FieldidsWeightsMap ,
) -> heed ::Result < ( ) > {
self . main . remap_types ::< Str , SerdeJson < _ > > ( ) . put (
wtxn ,
main_key ::FIELDIDS_WEIGHTS_MAP_KEY ,
map ,
)
}
/// Get the fieldids weights map which associates the field ids to their weights
2024-07-09 23:25:39 +08:00
pub fn fieldids_weights_map ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < FieldidsWeightsMap > {
2024-05-15 23:16:10 +08:00
self . main
2024-05-06 20:49:45 +08:00
. remap_types ::< Str , SerdeJson < _ > > ( )
. get ( rtxn , main_key ::FIELDIDS_WEIGHTS_MAP_KEY ) ?
2024-05-15 23:16:10 +08:00
. map ( Ok )
. unwrap_or_else ( | | {
Ok ( FieldidsWeightsMap ::from_field_id_map_without_searchable (
& self . fields_ids_map ( rtxn ) ? ,
) )
} )
}
/// Delete the fieldsids weights map
2024-07-09 23:25:39 +08:00
pub fn delete_fieldids_weights_map ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2024-05-15 23:16:10 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::FIELDIDS_WEIGHTS_MAP_KEY )
2024-05-06 20:49:45 +08:00
}
2024-05-07 23:56:40 +08:00
pub fn searchable_fields_and_weights < ' a > (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & ' a RoTxn < ' a > ,
2024-05-15 21:02:26 +08:00
) -> Result < Vec < ( Cow < ' a , str > , FieldId , Weight ) > > {
2024-05-07 23:56:40 +08:00
let fid_map = self . fields_ids_map ( rtxn ) ? ;
let weight_map = self . fieldids_weights_map ( rtxn ) ? ;
let searchable = self . searchable_fields ( rtxn ) ? ;
2024-05-15 21:02:26 +08:00
searchable
2024-05-07 23:56:40 +08:00
. into_iter ( )
2024-05-15 21:02:26 +08:00
. map ( | field | -> Result < _ > {
let fid = fid_map . id ( & field ) . ok_or_else ( | | FieldIdMapMissingEntry ::FieldName {
field_name : field . to_string ( ) ,
process : " searchable_fields_and_weights " ,
} ) ? ;
let weight = weight_map
. weight ( fid )
. ok_or ( InternalError ::FieldidsWeightsMapMissingEntry { key : fid } ) ? ;
Ok ( ( field , fid , weight ) )
2024-05-07 23:56:40 +08:00
} )
2024-05-15 21:02:26 +08:00
. collect ( )
2024-05-07 23:56:40 +08:00
}
2021-08-23 22:32:11 +08:00
/* geo rtree */
2021-09-09 18:20:08 +08:00
/// Writes the provided `rtree` which associates coordinates to documents ids.
2021-08-24 00:41:48 +08:00
pub ( crate ) fn put_geo_rtree (
2021-08-23 22:32:11 +08:00
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-08-23 22:32:11 +08:00
rtree : & RTree < GeoPoint > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < RTree < GeoPoint > > > ( ) . put (
wtxn ,
main_key ::GEO_RTREE_KEY ,
rtree ,
)
2021-08-23 22:32:11 +08:00
}
2021-09-09 18:20:08 +08:00
/// Delete the `rtree` which associates coordinates to documents ids.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_geo_rtree ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::GEO_RTREE_KEY )
2021-08-23 22:32:11 +08:00
}
2021-09-09 18:20:08 +08:00
/// Returns the `rtree` which associates coordinates to documents ids.
2024-07-09 23:25:39 +08:00
pub fn geo_rtree ( & self , rtxn : & RoTxn < '_ > ) -> Result < Option < RTree < GeoPoint > > > {
2021-08-23 22:32:11 +08:00
match self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < RTree < GeoPoint > > > ( )
. get ( rtxn , main_key ::GEO_RTREE_KEY ) ?
2021-08-23 22:32:11 +08:00
{
Some ( rtree ) = > Ok ( Some ( rtree ) ) ,
None = > Ok ( None ) ,
}
}
2021-08-26 23:49:50 +08:00
/* geo faceted */
2021-09-09 18:20:08 +08:00
/// Writes the documents ids that are faceted with a _geo field.
2021-08-26 23:49:50 +08:00
pub ( crate ) fn put_geo_faceted_documents_ids (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-08-26 23:49:50 +08:00
docids : & RoaringBitmap ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , RoaringBitmapCodec > ( ) . put (
2021-08-26 23:49:50 +08:00
wtxn ,
main_key ::GEO_FACETED_DOCUMENTS_IDS_KEY ,
docids ,
)
}
2021-09-09 18:20:08 +08:00
/// Delete the documents ids that are faceted with a _geo field.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_geo_faceted_documents_ids (
& self ,
wtxn : & mut RwTxn < '_ > ,
) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::GEO_FACETED_DOCUMENTS_IDS_KEY )
2021-08-26 23:49:50 +08:00
}
2021-09-20 22:10:39 +08:00
/// Retrieve all the documents ids that are faceted with a _geo field.
2024-07-09 23:25:39 +08:00
pub fn geo_faceted_documents_ids ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < RoaringBitmap > {
2021-08-26 23:49:50 +08:00
match self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , RoaringBitmapCodec > ( )
. get ( rtxn , main_key ::GEO_FACETED_DOCUMENTS_IDS_KEY ) ?
2021-08-26 23:49:50 +08:00
{
Some ( docids ) = > Ok ( docids ) ,
None = > Ok ( RoaringBitmap ::new ( ) ) ,
}
}
2021-06-17 21:16:20 +08:00
/* field distribution */
2021-03-31 23:14:23 +08:00
2021-06-17 21:16:20 +08:00
/// Writes the field distribution which associates every field name with
2021-04-01 15:07:16 +08:00
/// the number of times it occurs in the documents.
2021-06-17 21:16:20 +08:00
pub ( crate ) fn put_field_distribution (
2021-06-17 00:33:33 +08:00
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-21 21:57:41 +08:00
distribution : & FieldDistribution ,
2021-06-17 00:33:33 +08:00
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < FieldDistribution > > ( ) . put (
2021-06-17 00:33:33 +08:00
wtxn ,
2021-06-17 21:16:20 +08:00
main_key ::FIELD_DISTRIBUTION_KEY ,
2021-06-17 00:33:33 +08:00
distribution ,
)
2021-03-31 23:14:23 +08:00
}
2021-03-31 23:14:23 +08:00
2021-06-17 21:16:20 +08:00
/// Returns the field distribution which associates every field name with
2021-04-01 15:07:16 +08:00
/// the number of times it occurs in the documents.
2024-07-09 23:25:39 +08:00
pub fn field_distribution ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < FieldDistribution > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < FieldDistribution > > ( )
. get ( rtxn , main_key ::FIELD_DISTRIBUTION_KEY ) ?
2021-06-17 00:33:33 +08:00
. unwrap_or_default ( ) )
2021-03-31 23:14:23 +08:00
}
2020-11-02 18:48:33 +08:00
/* displayed fields */
2021-01-21 00:27:43 +08:00
/// Writes the fields that must be displayed in the defined order.
2020-11-02 18:45:16 +08:00
/// There must be not be any duplicate field id.
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_displayed_fields (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
fields : & [ & str ] ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < & [ & str ] > > ( ) . put (
2021-06-17 00:33:33 +08:00
wtxn ,
main_key ::DISPLAYED_FIELDS_KEY ,
& fields ,
)
2020-11-02 18:45:16 +08:00
}
/// Deletes the displayed fields ids, this will make the engine to display
/// all the documents attributes in the order of the `FieldsIdsMap`.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_displayed_fields ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::DISPLAYED_FIELDS_KEY )
2020-11-02 18:45:16 +08:00
}
2021-01-21 00:27:43 +08:00
/// Returns the displayed fields in the order they were set by the user. If it returns
/// `None` it means that all the attributes are set as displayed in the order of the `FieldsIdsMap`.
2024-07-09 23:25:39 +08:00
pub fn displayed_fields < ' t > ( & self , rtxn : & ' t RoTxn < '_ > ) -> heed ::Result < Option < Vec < & ' t str > > > {
2023-11-23 01:21:19 +08:00
self . main
. remap_types ::< Str , SerdeBincode < Vec < & ' t str > > > ( )
. get ( rtxn , main_key ::DISPLAYED_FIELDS_KEY )
2021-01-21 00:27:43 +08:00
}
2021-06-15 17:51:32 +08:00
/// Identical to `displayed_fields`, but returns the ids instead.
2024-07-09 23:25:39 +08:00
pub fn displayed_fields_ids ( & self , rtxn : & RoTxn < '_ > ) -> Result < Option < Vec < FieldId > > > {
2021-06-15 17:51:32 +08:00
match self . displayed_fields ( rtxn ) ? {
Some ( fields ) = > {
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
let mut fields_ids = Vec ::new ( ) ;
for name in fields . into_iter ( ) {
2021-07-22 23:11:17 +08:00
if let Some ( field_id ) = fields_ids_map . id ( name ) {
fields_ids . push ( field_id ) ;
2021-06-15 17:51:32 +08:00
}
}
Ok ( Some ( fields_ids ) )
2021-06-17 00:33:33 +08:00
}
2021-06-15 17:51:32 +08:00
None = > Ok ( None ) ,
}
2020-11-02 18:45:16 +08:00
}
2023-07-24 17:20:07 +08:00
/* remove hidden fields */
pub fn remove_hidden_fields (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2023-07-24 17:20:07 +08:00
fields : impl IntoIterator < Item = impl AsRef < str > > ,
) -> Result < ( BTreeSet < String > , bool ) > {
let mut valid_fields =
fields . into_iter ( ) . map ( | f | f . as_ref ( ) . to_string ( ) ) . collect ::< BTreeSet < String > > ( ) ;
let fields_len = valid_fields . len ( ) ;
if let Some ( dn ) = self . displayed_fields ( rtxn ) ? {
let displayable_names = dn . iter ( ) . map ( | s | s . to_string ( ) ) . collect ( ) ;
valid_fields = & valid_fields & & displayable_names ;
}
let hidden_fields = fields_len > valid_fields . len ( ) ;
Ok ( ( valid_fields , hidden_fields ) )
}
2020-11-02 00:52:04 +08:00
/* searchable fields */
2022-05-16 21:22:52 +08:00
/// Write the user defined searchable fields and generate the real searchable fields from the specified fields ids map.
pub ( crate ) fn put_all_searchable_fields_from_fields_ids_map (
2021-06-17 00:33:33 +08:00
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2024-05-15 23:16:10 +08:00
user_fields : & [ & str ] ,
2024-05-21 23:08:45 +08:00
non_searchable_fields_ids : & [ FieldId ] ,
2022-05-16 21:22:52 +08:00
fields_ids_map : & FieldsIdsMap ,
2024-05-06 20:49:45 +08:00
) -> Result < ( ) > {
2022-05-16 21:22:52 +08:00
// We can write the user defined searchable fields as-is.
self . put_user_defined_searchable_fields ( wtxn , user_fields ) ? ;
2024-05-15 23:16:10 +08:00
let mut weights = FieldidsWeightsMap ::default ( ) ;
2024-05-06 20:49:45 +08:00
2022-05-16 21:22:52 +08:00
// Now we generate the real searchable fields:
// 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion.
// 2. Iterate over the user defined searchable fields.
// 3. If a user defined field is a subset of a field defined in the fields_ids_map
2024-05-07 23:56:40 +08:00
// (ie doggo.name is a subset of doggo) right after doggo and with the same weight.
let mut real_fields = Vec ::new ( ) ;
2022-05-16 21:22:52 +08:00
2024-05-06 20:49:45 +08:00
for ( id , field_from_map ) in fields_ids_map . iter ( ) {
for ( weight , user_field ) in user_fields . iter ( ) . enumerate ( ) {
2022-05-16 21:22:52 +08:00
if crate ::is_faceted_by ( field_from_map , user_field )
2024-05-07 23:56:40 +08:00
& & ! real_fields . contains ( & field_from_map )
2024-05-21 23:08:45 +08:00
& & ! non_searchable_fields_ids . contains ( & id )
2022-05-16 21:22:52 +08:00
{
real_fields . push ( field_from_map ) ;
2024-05-06 20:49:45 +08:00
let weight : u16 =
weight . try_into ( ) . map_err ( | _ | UserError ::AttributeLimitReached ) ? ;
2024-05-14 23:20:57 +08:00
weights . insert ( id , weight ) ;
2022-05-16 21:22:52 +08:00
}
}
}
2024-05-06 20:49:45 +08:00
self . put_searchable_fields ( wtxn , & real_fields ) ? ;
self . put_fieldids_weights_map ( wtxn , & weights ) ? ;
Ok ( ( ) )
2022-05-16 21:22:52 +08:00
}
2024-07-09 23:25:39 +08:00
/// Removes the real searchable fields, the user defined searchable fields
/// and the field ids weights map.
///
/// Returns `true` if either of the two searchable fields lists was removed.
pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
    let searchable_removed = self.delete_searchable_fields(wtxn)?;
    let user_defined_removed = self.delete_user_defined_searchable_fields(wtxn)?;
    // The outcome of this deletion is not part of the returned flag.
    self.delete_fieldids_weights_map(wtxn)?;
    Ok(searchable_removed | user_defined_removed)
}
/// Writes the searchable fields, when this list is specified, only these are indexed.
2024-07-09 23:25:39 +08:00
fn put_searchable_fields ( & self , wtxn : & mut RwTxn < '_ > , fields : & [ & str ] ) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < & [ & str ] > > ( ) . put (
2021-06-17 00:33:33 +08:00
wtxn ,
main_key ::SEARCHABLE_FIELDS_KEY ,
& fields ,
)
2020-11-02 00:52:04 +08:00
}
/// Deletes the searchable fields, when no fields are specified, all fields are indexed.
2024-07-09 23:25:39 +08:00
fn delete_searchable_fields ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::SEARCHABLE_FIELDS_KEY )
2020-11-02 00:52:04 +08:00
}
2021-01-21 00:27:43 +08:00
/// Returns the searchable fields, those are the fields that are indexed,
2024-07-09 23:25:39 +08:00
pub fn searchable_fields < ' t > ( & self , rtxn : & ' t RoTxn < '_ > ) -> heed ::Result < Vec < Cow < ' t , str > > > {
2023-11-23 01:21:19 +08:00
self . main
. remap_types ::< Str , SerdeBincode < Vec < & ' t str > > > ( )
2024-05-06 20:49:45 +08:00
. get ( rtxn , main_key ::SEARCHABLE_FIELDS_KEY ) ?
2024-05-14 23:20:57 +08:00
. map ( | fields | Ok ( fields . into_iter ( ) . map ( Cow ::Borrowed ) . collect ( ) ) )
2024-05-06 20:49:45 +08:00
. unwrap_or_else ( | | {
Ok ( self
. fields_ids_map ( rtxn ) ?
. names ( )
2024-05-21 23:08:45 +08:00
. filter ( | name | ! crate ::is_faceted_by ( name , RESERVED_VECTORS_FIELD_NAME ) )
2024-05-06 20:49:45 +08:00
. map ( | field | Cow ::Owned ( field . to_string ( ) ) )
. collect ( ) )
} )
2021-01-21 00:27:43 +08:00
}
/// Identical to `searchable_fields`, but returns the ids instead.
2024-07-09 23:25:39 +08:00
pub fn searchable_fields_ids ( & self , rtxn : & RoTxn < '_ > ) -> Result < Vec < FieldId > > {
2024-05-06 20:49:45 +08:00
let fields = self . searchable_fields ( rtxn ) ? ;
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
let mut fields_ids = Vec ::new ( ) ;
for name in fields {
if let Some ( field_id ) = fields_ids_map . id ( & name ) {
fields_ids . push ( field_id ) ;
2021-06-17 00:33:33 +08:00
}
2021-01-21 00:27:43 +08:00
}
2024-05-06 20:49:45 +08:00
Ok ( fields_ids )
2020-11-02 00:52:04 +08:00
}
2022-05-16 21:22:52 +08:00
/// Writes the searchable fields, when this list is specified, only these are indexed.
pub ( crate ) fn put_user_defined_searchable_fields (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2022-05-16 21:22:52 +08:00
fields : & [ & str ] ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < _ > > ( ) . put (
2022-05-16 21:22:52 +08:00
wtxn ,
main_key ::USER_DEFINED_SEARCHABLE_FIELDS_KEY ,
& fields ,
)
}
/// Deletes the searchable fields, when no fields are specified, all fields are indexed.
pub ( crate ) fn delete_user_defined_searchable_fields (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2022-05-16 21:22:52 +08:00
) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::USER_DEFINED_SEARCHABLE_FIELDS_KEY )
2022-05-16 21:22:52 +08:00
}
/// Returns the user defined searchable fields.
pub fn user_defined_searchable_fields < ' t > (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & ' t RoTxn < ' t > ,
2022-05-16 21:22:52 +08:00
) -> heed ::Result < Option < Vec < & ' t str > > > {
self . main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < Vec < _ > > > ( )
. get ( rtxn , main_key ::USER_DEFINED_SEARCHABLE_FIELDS_KEY )
2022-05-16 21:22:52 +08:00
}
2024-03-26 20:27:43 +08:00
/// Identical to `user_defined_searchable_fields`, but returns ids instead.
2024-07-09 23:25:39 +08:00
pub fn user_defined_searchable_fields_ids (
& self ,
rtxn : & RoTxn < '_ > ,
) -> Result < Option < Vec < FieldId > > > {
2024-03-26 20:27:43 +08:00
match self . user_defined_searchable_fields ( rtxn ) ? {
Some ( fields ) = > {
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
let mut fields_ids = Vec ::new ( ) ;
for name in fields {
if let Some ( field_id ) = fields_ids_map . id ( name ) {
fields_ids . push ( field_id ) ;
}
}
Ok ( Some ( fields_ids ) )
}
None = > Ok ( None ) ,
}
}
2021-06-01 18:19:55 +08:00
/* filterable fields */
2020-11-11 23:17:37 +08:00
2021-06-01 18:19:55 +08:00
/// Writes the filterable fields names in the database.
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_filterable_fields (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
fields : & HashSet < String > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < _ > > ( ) . put (
wtxn ,
main_key ::FILTERABLE_FIELDS_KEY ,
fields ,
)
2020-11-11 23:17:37 +08:00
}
2021-06-01 18:19:55 +08:00
/// Deletes the filterable fields ids in the database.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_filterable_fields ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::FILTERABLE_FIELDS_KEY )
2020-11-11 23:17:37 +08:00
}
2021-06-01 18:19:55 +08:00
/// Returns the filterable fields names.
2024-07-09 23:25:39 +08:00
pub fn filterable_fields ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < HashSet < String > > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < _ > > ( )
. get ( rtxn , main_key ::FILTERABLE_FIELDS_KEY ) ?
2021-06-17 00:33:33 +08:00
. unwrap_or_default ( ) )
2021-06-01 18:19:55 +08:00
}
2021-06-15 17:51:32 +08:00
/// Identical to `filterable_fields`, but returns ids instead.
2024-07-09 23:25:39 +08:00
pub fn filterable_fields_ids ( & self , rtxn : & RoTxn < '_ > ) -> Result < HashSet < FieldId > > {
2021-06-15 17:51:32 +08:00
let fields = self . filterable_fields ( rtxn ) ? ;
2021-06-01 18:19:55 +08:00
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
2021-06-15 17:51:32 +08:00
let mut fields_ids = HashSet ::new ( ) ;
for name in fields {
2021-07-22 23:11:17 +08:00
if let Some ( field_id ) = fields_ids_map . id ( & name ) {
fields_ids . insert ( field_id ) ;
2021-06-15 17:51:32 +08:00
}
}
Ok ( fields_ids )
2021-06-01 18:19:55 +08:00
}
2021-06-01 18:20:29 +08:00
2021-08-23 17:37:18 +08:00
/* sortable fields */
/// Writes the sortable fields names in the database.
pub ( crate ) fn put_sortable_fields (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-08-23 17:37:18 +08:00
fields : & HashSet < String > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < _ > > ( ) . put (
wtxn ,
main_key ::SORTABLE_FIELDS_KEY ,
fields ,
)
2021-08-23 17:37:18 +08:00
}
/// Deletes the sortable fields ids in the database.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_sortable_fields ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::SORTABLE_FIELDS_KEY )
2021-08-23 17:37:18 +08:00
}
/// Returns the sortable fields names.
2024-07-09 23:25:39 +08:00
pub fn sortable_fields ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < HashSet < String > > {
2021-08-23 17:37:18 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < _ > > ( )
. get ( rtxn , main_key ::SORTABLE_FIELDS_KEY ) ?
2021-08-23 17:37:18 +08:00
. unwrap_or_default ( ) )
}
/// Identical to `sortable_fields`, but returns ids instead.
2024-07-09 23:25:39 +08:00
pub fn sortable_fields_ids ( & self , rtxn : & RoTxn < '_ > ) -> Result < HashSet < FieldId > > {
2021-08-23 17:37:18 +08:00
let fields = self . sortable_fields ( rtxn ) ? ;
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
Ok ( fields . into_iter ( ) . filter_map ( | name | fields_ids_map . id ( & name ) ) . collect ( ) )
}
2022-03-24 00:28:41 +08:00
/* faceted fields */
/// Writes the faceted fields in the database.
pub ( crate ) fn put_faceted_fields (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2022-03-24 00:28:41 +08:00
fields : & HashSet < String > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < _ > > ( ) . put (
wtxn ,
main_key ::HIDDEN_FACETED_FIELDS_KEY ,
fields ,
)
2022-03-24 00:28:41 +08:00
}
2021-06-01 18:20:29 +08:00
/// Returns the faceted fields names.
2024-07-09 23:25:39 +08:00
pub fn faceted_fields ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < HashSet < String > > {
2022-03-24 00:28:41 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < _ > > ( )
. get ( rtxn , main_key ::HIDDEN_FACETED_FIELDS_KEY ) ?
2022-03-24 00:28:41 +08:00
. unwrap_or_default ( ) )
}
/// Identical to `faceted_fields`, but returns ids instead.
2024-07-09 23:25:39 +08:00
pub fn faceted_fields_ids ( & self , rtxn : & RoTxn < '_ > ) -> Result < HashSet < FieldId > > {
2022-03-24 00:28:41 +08:00
let fields = self . faceted_fields ( rtxn ) ? ;
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
let mut fields_ids = HashSet ::new ( ) ;
for name in fields {
if let Some ( field_id ) = fields_ids_map . id ( & name ) {
fields_ids . insert ( field_id ) ;
}
}
Ok ( fields_ids )
}
/* user defined faceted fields */
/// Returns the user defined faceted fields names.
2021-06-01 18:20:29 +08:00
///
2022-03-24 00:28:41 +08:00
/// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields.
2024-07-09 23:25:39 +08:00
pub fn user_defined_faceted_fields ( & self , rtxn : & RoTxn < '_ > ) -> Result < HashSet < String > > {
2021-06-01 18:20:29 +08:00
let filterable_fields = self . filterable_fields ( rtxn ) ? ;
2021-08-23 17:37:18 +08:00
let sortable_fields = self . sortable_fields ( rtxn ) ? ;
2021-06-01 22:29:14 +08:00
let distinct_field = self . distinct_field ( rtxn ) ? ;
2021-06-17 00:33:33 +08:00
let asc_desc_fields =
self . criteria ( rtxn ) ? . into_iter ( ) . filter_map ( | criterion | match criterion {
2021-06-01 18:20:29 +08:00
Criterion ::Asc ( field ) | Criterion ::Desc ( field ) = > Some ( field ) ,
_otherwise = > None ,
} ) ;
let mut faceted_fields = filterable_fields ;
2021-08-23 17:37:18 +08:00
faceted_fields . extend ( sortable_fields ) ;
2021-06-01 18:20:29 +08:00
faceted_fields . extend ( asc_desc_fields ) ;
if let Some ( field ) = distinct_field {
faceted_fields . insert ( field . to_owned ( ) ) ;
}
Ok ( faceted_fields )
2021-01-21 00:27:43 +08:00
}
2022-03-24 00:28:41 +08:00
/// Identical to `user_defined_faceted_fields`, but returns ids instead.
2024-07-09 23:25:39 +08:00
pub fn user_defined_faceted_fields_ids ( & self , rtxn : & RoTxn < '_ > ) -> Result < HashSet < FieldId > > {
2024-03-26 20:27:43 +08:00
let fields = self . user_defined_faceted_fields ( rtxn ) ? ;
2021-01-21 00:27:43 +08:00
let fields_ids_map = self . fields_ids_map ( rtxn ) ? ;
2021-04-28 23:58:16 +08:00
2021-06-15 17:51:32 +08:00
let mut fields_ids = HashSet ::new ( ) ;
2024-03-26 20:27:43 +08:00
for name in fields {
2021-07-22 23:11:17 +08:00
if let Some ( field_id ) = fields_ids_map . id ( & name ) {
fields_ids . insert ( field_id ) ;
2021-06-15 17:51:32 +08:00
}
}
Ok ( fields_ids )
2020-11-11 23:17:37 +08:00
}
2020-11-23 20:08:57 +08:00
/* faceted documents ids */
2023-03-08 23:57:42 +08:00
/// Retrieve all the documents which contain this field id set as null
pub fn null_faceted_documents_ids (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2023-03-08 23:57:42 +08:00
field_id : FieldId ,
) -> heed ::Result < RoaringBitmap > {
2023-11-23 01:21:19 +08:00
match self . facet_id_is_null_docids . get ( rtxn , & field_id ) ? {
2023-03-08 23:57:42 +08:00
Some ( docids ) = > Ok ( docids ) ,
None = > Ok ( RoaringBitmap ::new ( ) ) ,
}
}
2023-03-15 01:08:12 +08:00
/// Retrieve all the documents which contain this field id and that is considered empty
pub fn empty_faceted_documents_ids (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2023-03-15 01:08:12 +08:00
field_id : FieldId ,
) -> heed ::Result < RoaringBitmap > {
2023-11-23 01:21:19 +08:00
match self . facet_id_is_empty_docids . get ( rtxn , & field_id ) ? {
2023-03-15 01:08:12 +08:00
Some ( docids ) = > Ok ( docids ) ,
None = > Ok ( RoaringBitmap ::new ( ) ) ,
}
}
2022-07-19 15:30:19 +08:00
/// Retrieve all the documents which contain this field id
pub fn exists_faceted_documents_ids (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2022-07-19 15:30:19 +08:00
field_id : FieldId ,
) -> heed ::Result < RoaringBitmap > {
2023-11-23 01:21:19 +08:00
match self . facet_id_exists_docids . get ( rtxn , & field_id ) ? {
2022-07-19 15:30:19 +08:00
Some ( docids ) = > Ok ( docids ) ,
None = > Ok ( RoaringBitmap ::new ( ) ) ,
2020-11-23 20:08:57 +08:00
}
}
2021-06-01 22:29:14 +08:00
/* distinct field */
2021-04-14 18:00:45 +08:00
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_distinct_field (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
distinct_field : & str ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , Str > ( ) . put ( wtxn , main_key ::DISTINCT_FIELD_KEY , distinct_field )
2021-04-14 18:00:45 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn distinct_field < ' a > ( & self , rtxn : & ' a RoTxn < '_ > ) -> heed ::Result < Option < & ' a str > > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , Str > ( ) . get ( rtxn , main_key ::DISTINCT_FIELD_KEY )
2021-04-14 18:00:45 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_distinct_field ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::DISTINCT_FIELD_KEY )
2021-04-14 18:00:45 +08:00
}
2020-11-27 19:14:56 +08:00
/* criteria */
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_criteria (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
criteria : & [ Criterion ] ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < & [ Criterion ] > > ( ) . put (
wtxn ,
main_key ::CRITERIA_KEY ,
& criteria ,
)
2020-11-27 19:14:56 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_criteria ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::CRITERIA_KEY )
2020-11-27 19:14:56 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn criteria ( & self , rtxn : & RoTxn < '_ > ) -> heed ::Result < Vec < Criterion > > {
2023-11-23 01:21:19 +08:00
match self
. main
. remap_types ::< Str , SerdeJson < Vec < Criterion > > > ( )
. get ( rtxn , main_key ::CRITERIA_KEY ) ?
{
2020-11-27 19:14:56 +08:00
Some ( criteria ) = > Ok ( criteria ) ,
None = > Ok ( default_criteria ( ) ) ,
}
}
2020-11-02 18:48:33 +08:00
/* words fst */
2021-04-07 16:53:57 +08:00
/// Writes the FST which is the words dictionary of the engine.
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_words_fst < A : AsRef < [ u8 ] > > (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
fst : & fst ::Set < A > ,
) -> heed ::Result < ( ) > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , Bytes > ( ) . put (
2023-11-23 01:21:19 +08:00
wtxn ,
main_key ::WORDS_FST_KEY ,
fst . as_fst ( ) . as_bytes ( ) ,
)
2020-10-21 21:55:48 +08:00
}
2021-04-07 16:53:57 +08:00
/// Returns the FST which is the words dictionary of the engine.
2024-07-09 23:25:39 +08:00
pub fn words_fst < ' t > ( & self , rtxn : & ' t RoTxn < '_ > ) -> Result < fst ::Set < Cow < ' t , [ u8 ] > > > {
2023-11-27 18:52:22 +08:00
match self . main . remap_types ::< Str , Bytes > ( ) . get ( rtxn , main_key ::WORDS_FST_KEY ) ? {
2020-10-25 21:49:04 +08:00
Some ( bytes ) = > Ok ( fst ::Set ::new ( bytes ) ? . map_data ( Cow ::Borrowed ) ? ) ,
None = > Ok ( fst ::Set ::default ( ) . map_data ( Cow ::Owned ) ? ) ,
2020-10-21 21:55:48 +08:00
}
}
2021-03-30 01:15:47 +08:00
/* stop words */
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_stop_words < A : AsRef < [ u8 ] > > (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
fst : & fst ::Set < A > ,
) -> heed ::Result < ( ) > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , Bytes > ( ) . put (
2023-11-23 01:21:19 +08:00
wtxn ,
main_key ::STOP_WORDS_KEY ,
fst . as_fst ( ) . as_bytes ( ) ,
)
2021-03-30 01:15:47 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_stop_words ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::STOP_WORDS_KEY )
2021-03-30 01:15:47 +08:00
}
2021-04-07 16:53:57 +08:00
2024-07-09 23:25:39 +08:00
pub fn stop_words < ' t > ( & self , rtxn : & ' t RoTxn < ' t > ) -> Result < Option < fst ::Set < & ' t [ u8 ] > > > {
2023-11-27 18:52:22 +08:00
match self . main . remap_types ::< Str , Bytes > ( ) . get ( rtxn , main_key ::STOP_WORDS_KEY ) ? {
2021-03-30 01:15:47 +08:00
Some ( bytes ) = > Ok ( Some ( fst ::Set ::new ( bytes ) ? ) ) ,
None = > Ok ( None ) ,
}
}
2023-07-24 23:00:18 +08:00
/* non separator tokens */
pub ( crate ) fn put_non_separator_tokens (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2023-07-24 23:00:18 +08:00
set : & BTreeSet < String > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < _ > > ( ) . put (
wtxn ,
main_key ::NON_SEPARATOR_TOKENS_KEY ,
set ,
)
2023-07-24 23:00:18 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_non_separator_tokens ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::NON_SEPARATOR_TOKENS_KEY )
2023-07-24 23:00:18 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn non_separator_tokens ( & self , rtxn : & RoTxn < '_ > ) -> Result < Option < BTreeSet < String > > > {
2023-11-23 01:21:19 +08:00
Ok ( self
. main
. remap_types ::< Str , SerdeBincode < BTreeSet < String > > > ( )
. get ( rtxn , main_key ::NON_SEPARATOR_TOKENS_KEY ) ? )
2023-07-24 23:00:18 +08:00
}
/* separator tokens */
pub ( crate ) fn put_separator_tokens (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2023-07-24 23:00:18 +08:00
set : & BTreeSet < String > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < _ > > ( ) . put (
wtxn ,
main_key ::SEPARATOR_TOKENS_KEY ,
set ,
)
2023-07-24 23:00:18 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_separator_tokens ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::SEPARATOR_TOKENS_KEY )
2023-07-24 23:00:18 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn separator_tokens ( & self , rtxn : & RoTxn < '_ > ) -> Result < Option < BTreeSet < String > > > {
2023-07-24 23:00:18 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < BTreeSet < String > > > ( )
. get ( rtxn , main_key ::SEPARATOR_TOKENS_KEY ) ? )
2023-07-24 23:00:18 +08:00
}
/* separators easing method */
2024-07-09 23:25:39 +08:00
pub fn allowed_separators ( & self , rtxn : & RoTxn < '_ > ) -> Result < Option < BTreeSet < String > > > {
2023-07-24 23:00:18 +08:00
let default_separators =
charabia ::separators ::DEFAULT_SEPARATORS . iter ( ) . map ( | s | s . to_string ( ) ) ;
let mut separators : Option < BTreeSet < _ > > = None ;
if let Some ( mut separator_tokens ) = self . separator_tokens ( rtxn ) ? {
separator_tokens . extend ( default_separators . clone ( ) ) ;
separators = Some ( separator_tokens ) ;
}
if let Some ( non_separator_tokens ) = self . non_separator_tokens ( rtxn ) ? {
separators = separators
. or_else ( | | Some ( default_separators . collect ( ) ) )
. map ( | separators | & separators - & non_separator_tokens ) ;
}
Ok ( separators )
}
/* dictionary */
pub ( crate ) fn put_dictionary (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2023-07-24 23:00:18 +08:00
set : & BTreeSet < String > ,
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < _ > > ( ) . put ( wtxn , main_key ::DICTIONARY_KEY , set )
2023-07-24 23:00:18 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_dictionary ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::DICTIONARY_KEY )
2023-07-24 23:00:18 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn dictionary ( & self , rtxn : & RoTxn < '_ > ) -> Result < Option < BTreeSet < String > > > {
2023-07-24 23:00:18 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < BTreeSet < String > > > ( )
. get ( rtxn , main_key ::DICTIONARY_KEY ) ? )
2023-07-24 23:00:18 +08:00
}
2021-04-07 16:53:57 +08:00
/* synonyms */
2021-06-15 19:45:20 +08:00
pub ( crate ) fn put_synonyms (
2021-06-15 17:06:42 +08:00
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-15 17:06:42 +08:00
synonyms : & HashMap < Vec < String > , Vec < Vec < String > > > ,
2023-07-26 15:33:42 +08:00
user_defined_synonyms : & BTreeMap < String , Vec < String > > ,
2021-06-17 00:33:33 +08:00
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < _ > > ( ) . put (
wtxn ,
main_key ::SYNONYMS_KEY ,
synonyms ,
) ? ;
self . main . remap_types ::< Str , SerdeBincode < _ > > ( ) . put (
2023-07-26 15:33:42 +08:00
wtxn ,
main_key ::USER_DEFINED_SYNONYMS_KEY ,
user_defined_synonyms ,
)
2021-04-07 16:53:57 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_synonyms ( & self , wtxn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::SYNONYMS_KEY ) ? ;
self . main . remap_key_type ::< Str > ( ) . delete ( wtxn , main_key ::USER_DEFINED_SYNONYMS_KEY )
2023-07-26 15:33:42 +08:00
}
pub fn user_defined_synonyms (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2023-07-26 15:33:42 +08:00
) -> heed ::Result < BTreeMap < String , Vec < String > > > {
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < _ > > ( )
. get ( rtxn , main_key ::USER_DEFINED_SYNONYMS_KEY ) ?
2023-07-26 15:33:42 +08:00
. unwrap_or_default ( ) )
2021-04-07 16:53:57 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn synonyms (
& self ,
rtxn : & RoTxn < '_ > ,
) -> heed ::Result < HashMap < Vec < String > , Vec < Vec < String > > > > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < _ > > ( )
. get ( rtxn , main_key ::SYNONYMS_KEY ) ?
2021-06-17 00:33:33 +08:00
. unwrap_or_default ( ) )
2021-04-07 16:53:57 +08:00
}
2021-06-15 17:06:42 +08:00
pub fn words_synonyms < S : AsRef < str > > (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2021-06-15 17:06:42 +08:00
words : & [ S ] ,
2021-06-17 00:33:33 +08:00
) -> heed ::Result < Option < Vec < Vec < String > > > > {
2021-04-10 03:56:20 +08:00
let words : Vec < _ > = words . iter ( ) . map ( | s | s . as_ref ( ) . to_owned ( ) ) . collect ( ) ;
Ok ( self . synonyms ( rtxn ) ? . remove ( & words ) )
2021-04-07 16:53:57 +08:00
}
2021-02-03 17:30:33 +08:00
/* words prefixes fst */
2024-04-02 19:37:55 +08:00
/// Writes the FST which is the words prefixes dictionary of the engine.
2021-06-17 00:33:33 +08:00
pub ( crate ) fn put_words_prefixes_fst < A : AsRef < [ u8 ] > > (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2021-06-17 00:33:33 +08:00
fst : & fst ::Set < A > ,
) -> heed ::Result < ( ) > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , Bytes > ( ) . put (
2021-06-17 00:33:33 +08:00
wtxn ,
main_key ::WORDS_PREFIXES_FST_KEY ,
fst . as_fst ( ) . as_bytes ( ) ,
)
2021-02-03 17:30:33 +08:00
}
2024-04-02 19:37:55 +08:00
/// Returns the FST which is the words prefixes dictionary of the engine.
2024-07-09 23:25:39 +08:00
pub fn words_prefixes_fst < ' t > ( & self , rtxn : & ' t RoTxn < ' t > ) -> Result < fst ::Set < Cow < ' t , [ u8 ] > > > {
2023-11-27 18:52:22 +08:00
match self . main . remap_types ::< Str , Bytes > ( ) . get ( rtxn , main_key ::WORDS_PREFIXES_FST_KEY ) ? {
2021-02-03 17:30:33 +08:00
Some ( bytes ) = > Ok ( fst ::Set ::new ( bytes ) ? . map_data ( Cow ::Borrowed ) ? ) ,
None = > Ok ( fst ::Set ::default ( ) . map_data ( Cow ::Owned ) ? ) ,
}
}
2021-02-18 21:35:14 +08:00
/* word documents count */
/// Returns the number of documents ids associated with the given word,
/// it is much faster than deserializing the bitmap and getting the length of it.
2024-07-09 23:25:39 +08:00
pub fn word_documents_count ( & self , rtxn : & RoTxn < '_ > , word : & str ) -> heed ::Result < Option < u64 > > {
2021-02-18 21:59:37 +08:00
self . word_docids . remap_data_type ::< RoaringBitmapLenCodec > ( ) . get ( rtxn , word )
2021-02-18 21:35:14 +08:00
}
2021-02-03 17:30:33 +08:00
/* documents */
2023-03-08 16:44:09 +08:00
/// Returns an iterator over the requested documents. The next item will be an error if a document is missing.
pub fn iter_documents < ' a , ' t : ' a > (
& ' a self ,
2024-07-09 23:25:39 +08:00
rtxn : & ' t RoTxn < ' t > ,
2023-03-08 16:44:09 +08:00
ids : impl IntoIterator < Item = DocumentId > + ' a ,
) -> Result < impl Iterator < Item = Result < ( DocumentId , obkv ::KvReaderU16 < ' t > ) > > + ' a > {
Ok ( ids . into_iter ( ) . map ( move | id | {
2021-06-17 00:33:33 +08:00
let kv = self
. documents
2023-11-23 01:21:19 +08:00
. get ( rtxn , & id ) ?
2022-10-25 03:34:13 +08:00
. ok_or ( UserError ::UnknownInternalDocumentId { document_id : id } ) ? ;
2023-03-08 16:44:09 +08:00
Ok ( ( id , kv ) )
} ) )
2020-10-21 21:55:48 +08:00
}
2023-03-08 16:44:09 +08:00
/// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
pub fn documents < ' t > (
2021-05-04 17:23:51 +08:00
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & ' t RoTxn < ' t > ,
2023-03-08 16:44:09 +08:00
ids : impl IntoIterator < Item = DocumentId > ,
) -> Result < Vec < ( DocumentId , obkv ::KvReaderU16 < ' t > ) > > {
self . iter_documents ( rtxn , ids ) ? . collect ( )
}
2022-06-13 23:59:34 +08:00
2023-03-08 16:44:09 +08:00
/// Returns an iterator over all the documents in the index.
pub fn all_documents < ' a , ' t : ' a > (
& ' a self ,
2024-07-09 23:25:39 +08:00
rtxn : & ' t RoTxn < ' t > ,
2023-03-08 16:44:09 +08:00
) -> Result < impl Iterator < Item = Result < ( DocumentId , obkv ::KvReaderU16 < ' t > ) > > + ' a > {
self . iter_documents ( rtxn , self . documents_ids ( rtxn ) ? )
2021-05-04 17:23:51 +08:00
}
2023-11-09 21:22:43 +08:00
pub fn external_id_of < ' a , ' t : ' a > (
& ' a self ,
2024-07-09 23:25:39 +08:00
rtxn : & ' t RoTxn < ' t > ,
2023-11-09 21:22:43 +08:00
ids : impl IntoIterator < Item = DocumentId > + ' a ,
) -> Result < impl IntoIterator < Item = Result < String > > + ' a > {
let fields = self . fields_ids_map ( rtxn ) ? ;
// uses precondition "never called on an empty index"
let primary_key = self . primary_key ( rtxn ) ? . ok_or ( InternalError ::DatabaseMissingEntry {
db_name : db_name ::MAIN ,
key : Some ( main_key ::PRIMARY_KEY_KEY ) ,
} ) ? ;
let primary_key = PrimaryKey ::new ( primary_key , & fields ) . ok_or_else ( | | {
InternalError ::FieldIdMapMissingEntry ( crate ::FieldIdMapMissingEntry ::FieldName {
field_name : primary_key . to_owned ( ) ,
process : " external_id_of " ,
} )
} ) ? ;
Ok ( self . iter_documents ( rtxn , ids ) ? . map ( move | entry | -> Result < _ > {
let ( _docid , obkv ) = entry ? ;
match primary_key . document_id ( & obkv , & fields ) ? {
Ok ( document_id ) = > Ok ( document_id ) ,
Err ( _ ) = > Err ( InternalError ::DocumentsError (
crate ::documents ::Error ::InvalidDocumentFormat ,
)
. into ( ) ) ,
}
} ) )
}
2024-07-09 23:25:39 +08:00
pub fn facets_distribution < ' a > ( & ' a self , rtxn : & ' a RoTxn < ' a > ) -> FacetDistribution < ' a > {
2020-12-29 02:08:53 +08:00
FacetDistribution ::new ( rtxn , self )
}
2024-07-09 23:25:39 +08:00
pub fn search < ' a > ( & ' a self , rtxn : & ' a RoTxn < ' a > ) -> Search < ' a > {
2020-10-21 21:55:48 +08:00
Search ::new ( rtxn , self )
}
2021-03-12 01:32:04 +08:00
/// Returns the index creation time.
2024-07-09 23:25:39 +08:00
pub fn created_at ( & self , rtxn : & RoTxn < '_ > ) -> Result < OffsetDateTime > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < OffsetDateTime > > ( )
. get ( rtxn , main_key ::CREATED_AT_KEY ) ?
2021-06-15 17:51:32 +08:00
. ok_or ( InternalError ::DatabaseMissingEntry {
db_name : db_name ::MAIN ,
key : Some ( main_key ::CREATED_AT_KEY ) ,
} ) ? )
2021-03-12 01:32:04 +08:00
}
2021-03-12 01:42:21 +08:00
/// Returns the index last updated time.
2024-07-09 23:25:39 +08:00
pub fn updated_at ( & self , rtxn : & RoTxn < '_ > ) -> Result < OffsetDateTime > {
2021-06-17 00:33:33 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeJson < OffsetDateTime > > ( )
. get ( rtxn , main_key ::UPDATED_AT_KEY ) ?
2021-06-15 17:51:32 +08:00
. ok_or ( InternalError ::DatabaseMissingEntry {
db_name : db_name ::MAIN ,
key : Some ( main_key ::UPDATED_AT_KEY ) ,
} ) ? )
2021-03-12 01:32:04 +08:00
}
2021-06-17 00:33:33 +08:00
pub ( crate ) fn set_updated_at (
& self ,
2024-07-09 23:25:39 +08:00
wtxn : & mut RwTxn < '_ > ,
2022-02-15 18:41:55 +08:00
time : & OffsetDateTime ,
2021-06-17 00:33:33 +08:00
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < OffsetDateTime > > ( ) . put (
wtxn ,
main_key ::UPDATED_AT_KEY ,
time ,
)
2021-03-12 01:32:04 +08:00
}
2022-03-16 17:03:18 +08:00
2024-07-09 23:25:39 +08:00
pub fn authorize_typos ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < bool > {
2022-03-16 17:03:18 +08:00
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
2023-11-27 18:52:22 +08:00
match self . main . remap_types ::< Str , U8 > ( ) . get ( txn , main_key ::AUTHORIZE_TYPOS ) ? {
2022-03-16 17:03:18 +08:00
Some ( 0 ) = > Ok ( false ) ,
_ = > Ok ( true ) ,
}
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn put_authorize_typos ( & self , txn : & mut RwTxn < '_ > , flag : bool ) -> heed ::Result < ( ) > {
2022-03-16 17:03:18 +08:00
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , U8 > ( ) . put ( txn , main_key ::AUTHORIZE_TYPOS , & ( flag as u8 ) ) ? ;
2022-03-16 17:03:18 +08:00
Ok ( ( ) )
}
2022-03-21 20:03:06 +08:00
2024-07-09 23:25:39 +08:00
pub fn min_word_len_one_typo ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < u8 > {
2022-03-21 20:03:06 +08:00
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
Ok ( self
. main
2023-11-27 18:52:22 +08:00
. remap_types ::< Str , U8 > ( )
2023-11-23 01:21:19 +08:00
. get ( txn , main_key ::ONE_TYPO_WORD_LEN ) ?
2022-04-01 00:23:12 +08:00
. unwrap_or ( DEFAULT_MIN_WORD_LEN_ONE_TYPO ) )
2022-03-21 20:03:06 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn put_min_word_len_one_typo (
& self ,
txn : & mut RwTxn < '_ > ,
val : u8 ,
) -> heed ::Result < ( ) > {
2022-03-21 20:03:06 +08:00
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , U8 > ( ) . put ( txn , main_key ::ONE_TYPO_WORD_LEN , & val ) ? ;
2022-03-21 20:03:06 +08:00
Ok ( ( ) )
}
2024-07-09 23:25:39 +08:00
pub fn min_word_len_two_typos ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < u8 > {
2022-03-21 20:03:06 +08:00
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
Ok ( self
. main
2023-11-27 18:52:22 +08:00
. remap_types ::< Str , U8 > ( )
2023-11-23 01:21:19 +08:00
. get ( txn , main_key ::TWO_TYPOS_WORD_LEN ) ?
2022-04-01 00:23:12 +08:00
. unwrap_or ( DEFAULT_MIN_WORD_LEN_TWO_TYPOS ) )
2022-03-21 20:03:06 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn put_min_word_len_two_typos (
& self ,
txn : & mut RwTxn < '_ > ,
val : u8 ,
) -> heed ::Result < ( ) > {
2022-03-21 20:03:06 +08:00
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , U8 > ( ) . put ( txn , main_key ::TWO_TYPOS_WORD_LEN , & val ) ? ;
2022-03-21 20:03:06 +08:00
Ok ( ( ) )
}
2022-03-21 21:03:31 +08:00
/// List the words on which typo are not allowed
2024-07-09 23:25:39 +08:00
pub fn exact_words < ' t > ( & self , txn : & ' t RoTxn < ' t > ) -> Result < Option < fst ::Set < Cow < ' t , [ u8 ] > > > > {
2023-11-27 18:52:22 +08:00
match self . main . remap_types ::< Str , Bytes > ( ) . get ( txn , main_key ::EXACT_WORDS ) ? {
2022-05-24 15:15:49 +08:00
Some ( bytes ) = > Ok ( Some ( fst ::Set ::new ( bytes ) ? . map_data ( Cow ::Borrowed ) ? ) ) ,
None = > Ok ( None ) ,
2022-03-21 21:03:31 +08:00
}
}
pub ( crate ) fn put_exact_words < A : AsRef < [ u8 ] > > (
& self ,
2024-07-09 23:25:39 +08:00
txn : & mut RwTxn < '_ > ,
2022-03-21 21:03:31 +08:00
words : & fst ::Set < A > ,
) -> Result < ( ) > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , Bytes > ( ) . put (
2022-03-21 21:03:31 +08:00
txn ,
main_key ::EXACT_WORDS ,
words . as_fst ( ) . as_bytes ( ) ,
) ? ;
Ok ( ( ) )
}
2022-03-23 02:07:59 +08:00
2022-04-05 20:10:22 +08:00
/// Returns the exact attributes: attributes for which typo is disallowed.
2024-07-09 23:25:39 +08:00
pub fn exact_attributes < ' t > ( & self , txn : & ' t RoTxn < ' t > ) -> Result < Vec < & ' t str > > {
2022-03-23 02:07:59 +08:00
Ok ( self
. main
2023-11-23 01:21:19 +08:00
. remap_types ::< Str , SerdeBincode < Vec < & str > > > ( )
. get ( txn , main_key ::EXACT_ATTRIBUTES ) ?
2022-03-23 02:07:59 +08:00
. unwrap_or_default ( ) )
}
2022-03-30 22:07:59 +08:00
2022-04-05 20:10:22 +08:00
/// Returns the list of exact attributes field ids.
2024-07-09 23:25:39 +08:00
pub fn exact_attributes_ids ( & self , txn : & RoTxn < '_ > ) -> Result < HashSet < FieldId > > {
2022-03-25 00:00:29 +08:00
let attrs = self . exact_attributes ( txn ) ? ;
let fid_map = self . fields_ids_map ( txn ) ? ;
Ok ( attrs . iter ( ) . filter_map ( | attr | fid_map . id ( attr ) ) . collect ( ) )
}
2022-03-23 02:07:59 +08:00
2022-04-05 20:10:22 +08:00
/// Writes the exact attributes to the database.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn put_exact_attributes ( & self , txn : & mut RwTxn < '_ > , attrs : & [ & str ] ) -> Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeBincode < & [ & str ] > > ( ) . put (
txn ,
main_key ::EXACT_ATTRIBUTES ,
& attrs ,
) ? ;
2022-03-23 02:07:59 +08:00
Ok ( ( ) )
}
2022-04-05 20:10:22 +08:00
/// Clears the exact attributes from the store.
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_exact_attributes ( & self , txn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( txn , main_key ::EXACT_ATTRIBUTES )
2022-03-23 02:07:59 +08:00
}
2022-06-08 23:28:23 +08:00
2024-07-09 23:25:39 +08:00
pub fn max_values_per_facet ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < Option < u64 > > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , BEU64 > ( ) . get ( txn , main_key ::MAX_VALUES_PER_FACET )
2022-06-08 23:28:23 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn put_max_values_per_facet (
& self ,
txn : & mut RwTxn < '_ > ,
val : u64 ,
) -> heed ::Result < ( ) > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , BEU64 > ( ) . put ( txn , main_key ::MAX_VALUES_PER_FACET , & val )
2022-06-08 23:28:23 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_max_values_per_facet ( & self , txn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( txn , main_key ::MAX_VALUES_PER_FACET )
2022-06-08 23:28:23 +08:00
}
2022-06-08 23:31:21 +08:00
2024-07-09 23:25:39 +08:00
pub fn sort_facet_values_by ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < OrderByMap > {
2024-03-12 18:01:46 +08:00
let orders = self
2023-06-22 23:13:40 +08:00
. main
2024-03-12 18:01:46 +08:00
. remap_types ::< Str , SerdeJson < OrderByMap > > ( )
2023-11-23 01:21:19 +08:00
. get ( txn , main_key ::SORT_FACET_VALUES_BY ) ?
2023-06-22 23:13:40 +08:00
. unwrap_or_default ( ) ;
Ok ( orders )
}
pub ( crate ) fn put_sort_facet_values_by (
& self ,
2024-07-09 23:25:39 +08:00
txn : & mut RwTxn < '_ > ,
2024-03-12 18:01:46 +08:00
val : & OrderByMap ,
2023-06-22 23:13:40 +08:00
) -> heed ::Result < ( ) > {
2023-11-23 01:21:19 +08:00
self . main . remap_types ::< Str , SerdeJson < _ > > ( ) . put ( txn , main_key ::SORT_FACET_VALUES_BY , & val )
2023-06-22 23:13:40 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_sort_facet_values_by ( & self , txn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( txn , main_key ::SORT_FACET_VALUES_BY )
2023-06-22 23:13:40 +08:00
}
2024-07-09 23:25:39 +08:00
pub fn pagination_max_total_hits ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < Option < u64 > > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , BEU64 > ( ) . get ( txn , main_key ::PAGINATION_MAX_TOTAL_HITS )
2022-06-08 23:31:21 +08:00
}
2022-06-22 18:00:45 +08:00
pub ( crate ) fn put_pagination_max_total_hits (
2022-06-08 23:31:21 +08:00
& self ,
2024-07-09 23:25:39 +08:00
txn : & mut RwTxn < '_ > ,
2023-11-27 18:52:22 +08:00
val : u64 ,
2022-06-08 23:31:21 +08:00
) -> heed ::Result < ( ) > {
2023-11-27 18:52:22 +08:00
self . main . remap_types ::< Str , BEU64 > ( ) . put ( txn , main_key ::PAGINATION_MAX_TOTAL_HITS , & val )
2022-06-08 23:31:21 +08:00
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_pagination_max_total_hits (
& self ,
txn : & mut RwTxn < '_ > ,
) -> heed ::Result < bool > {
2023-11-23 01:21:19 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( txn , main_key ::PAGINATION_MAX_TOTAL_HITS )
2022-06-08 23:31:21 +08:00
}
2022-10-15 03:05:53 +08:00
2024-07-09 23:25:39 +08:00
pub fn proximity_precision ( & self , txn : & RoTxn < '_ > ) -> heed ::Result < Option < ProximityPrecision > > {
2023-12-06 22:49:02 +08:00
self . main
. remap_types ::< Str , SerdeBincode < ProximityPrecision > > ( )
. get ( txn , main_key ::PROXIMITY_PRECISION )
}
pub ( crate ) fn put_proximity_precision (
& self ,
2024-07-09 23:25:39 +08:00
txn : & mut RwTxn < '_ > ,
2023-12-06 22:49:02 +08:00
val : ProximityPrecision ,
) -> heed ::Result < ( ) > {
self . main . remap_types ::< Str , SerdeBincode < ProximityPrecision > > ( ) . put (
txn ,
main_key ::PROXIMITY_PRECISION ,
& val ,
)
}
2024-07-09 23:25:39 +08:00
pub ( crate ) fn delete_proximity_precision ( & self , txn : & mut RwTxn < '_ > ) -> heed ::Result < bool > {
2023-12-06 22:49:02 +08:00
self . main . remap_key_type ::< Str > ( ) . delete ( txn , main_key ::PROXIMITY_PRECISION )
}
2022-10-15 03:05:53 +08:00
/* script language docids */
/// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
2022-10-17 19:51:04 +08:00
pub fn script_language_documents_ids (
& self ,
2024-07-09 23:25:39 +08:00
rtxn : & RoTxn < '_ > ,
2022-10-17 19:51:04 +08:00
key : & ( Script , Language ) ,
) -> heed ::Result < Option < RoaringBitmap > > {
2023-11-06 18:19:31 +08:00
self . script_language_docids . get ( rtxn , key )
2022-10-15 03:05:53 +08:00
}
2023-02-02 01:57:43 +08:00
2024-07-09 23:25:39 +08:00
/// Groups, per script, the languages that are significantly represented in the index.
///
/// Every `(Script, Language)` entry contributes the length of its documents ids bitmap
/// to a running total. A language is kept only when its own count is strictly greater
/// than 5% (integer division by 20) of that total, which also excludes empty entries.
pub fn script_language(
    &self,
    rtxn: &RoTxn<'_>,
) -> heed::Result<HashMap<Script, Vec<Language>>> {
    let mut doc_counts: Vec<(Script, Language, u64)> = Vec::new();
    let mut total = 0;
    for entry in self.script_language_docids.iter(rtxn)? {
        let ((script, language), docids) = entry?;
        let count = docids.len();
        total += count;
        // keep only Languages that contains at least 1 document.
        if count > 0 {
            doc_counts.push((script, language, count));
        }
    }

    let threshold = total / 20; // 5% (arbitrary)
    let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
    for (script, language, count) in doc_counts {
        if count > threshold {
            // A single lookup either finds the existing list or creates an empty one.
            script_language.entry(script).or_default().push(language);
        }
    }
    Ok(script_language)
}
2023-11-15 22:46:37 +08:00
2024-07-23 20:59:31 +08:00
/// Lists the languages that are significantly represented in the index.
///
/// Every `(Script, Language)` entry contributes the length of its documents ids bitmap
/// to a running total. A language is returned only when its own count is strictly greater
/// than 5% (integer division by 20) of that total.
pub fn languages(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<Language>> {
    let mut doc_counts: Vec<(Language, u64)> = Vec::new();
    let mut total = 0;
    for entry in self.script_language_docids.iter(rtxn)? {
        let ((_script, language), docids) = entry?;
        let count = docids.len();
        total += count;
        // keep only Languages that contains at least 1 document.
        if count > 0 {
            doc_counts.push((language, count));
        }
    }

    let threshold = total / 20; // 5% (arbitrary)
    let mut languages = Vec::new();
    for (language, count) in doc_counts {
        if count > threshold {
            languages.push(language);
        }
    }
    Ok(languages)
}
2024-05-22 21:27:09 +08:00
/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
2023-11-15 22:46:37 +08:00
pub ( crate ) fn put_embedding_configs (
& self ,
wtxn : & mut RwTxn < '_ > ,
2024-05-30 17:50:30 +08:00
configs : Vec < IndexEmbeddingConfig > ,
2023-11-15 22:46:37 +08:00
) -> heed ::Result < ( ) > {
2024-05-30 17:50:30 +08:00
self . main . remap_types ::< Str , SerdeJson < Vec < IndexEmbeddingConfig > > > ( ) . put (
wtxn ,
main_key ::EMBEDDING_CONFIGS ,
& configs ,
)
2023-11-15 22:46:37 +08:00
}
/// Removes the entry stored under `main_key::EMBEDDING_CONFIGS`.
///
/// Forwards the `bool` returned by the underlying `delete` call.
pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
    let main_db = self.main.remap_key_type::<Str>();
    main_db.delete(wtxn, main_key::EMBEDDING_CONFIGS)
}
2024-05-30 17:50:30 +08:00
pub fn embedding_configs ( & self , rtxn : & RoTxn < '_ > ) -> Result < Vec < IndexEmbeddingConfig > > {
2023-11-15 22:46:37 +08:00
Ok ( self
. main
2024-05-30 17:50:30 +08:00
. remap_types ::< Str , SerdeJson < Vec < IndexEmbeddingConfig > > > ( )
2023-11-15 22:46:37 +08:00
. get ( rtxn , main_key ::EMBEDDING_CONFIGS ) ?
. unwrap_or_default ( ) )
}
2023-12-13 22:38:44 +08:00
2024-05-28 20:22:19 +08:00
pub fn arroy_readers < ' a > (
& ' a self ,
rtxn : & ' a RoTxn < ' a > ,
embedder_id : u8 ,
2024-07-09 23:25:39 +08:00
) -> impl Iterator < Item = Result < arroy ::Reader < ' a , arroy ::distances ::Angular > > > + ' a {
2024-05-28 20:22:19 +08:00
crate ::vector ::arroy_db_range_for_embedder ( embedder_id ) . map_while ( move | k | {
arroy ::Reader ::open ( rtxn , k , self . vector_arroy )
. map ( Some )
. or_else ( | e | match e {
2024-06-20 21:59:32 +08:00
arroy ::Error ::MissingMetadata ( _ ) = > Ok ( None ) ,
2024-05-28 20:22:19 +08:00
e = > Err ( e . into ( ) ) ,
} )
. transpose ( )
} )
}
2024-03-12 01:24:21 +08:00
/// Stores `cutoff` under `main_key::SEARCH_CUTOFF`.
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
    let main_db = self.main.remap_types::<Str, BEU64>();
    main_db.put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
}
/// Returns the value stored under `main_key::SEARCH_CUTOFF`, if any.
pub fn search_cutoff(&self, rtxn: &RoTxn<'_>) -> Result<Option<u64>> {
    let main_db = self.main.remap_types::<Str, BEU64>();
    let cutoff = main_db.get(rtxn, main_key::SEARCH_CUTOFF)?;
    Ok(cutoff)
}
/// Removes the entry stored under `main_key::SEARCH_CUTOFF`.
///
/// Forwards the `bool` returned by the underlying `delete` call.
pub(crate) fn delete_search_cutoff(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
    let main_db = self.main.remap_key_type::<Str>();
    main_db.delete(wtxn, main_key::SEARCH_CUTOFF)
}
2024-05-14 17:38:28 +08:00
pub fn embeddings (
& self ,
rtxn : & RoTxn < '_ > ,
docid : DocumentId ,
2024-05-22 18:26:00 +08:00
) -> Result < BTreeMap < String , Vec < Embedding > > > {
2024-05-14 17:38:28 +08:00
let mut res = BTreeMap ::new ( ) ;
for row in self . embedder_category_id . iter ( rtxn ) ? {
let ( embedder_name , embedder_id ) = row ? ;
let embedder_id = ( embedder_id as u16 ) < < 8 ;
let mut embeddings = Vec ::new ( ) ;
' vectors : for i in 0 ..= u8 ::MAX {
let reader = arroy ::Reader ::open ( rtxn , embedder_id | ( i as u16 ) , self . vector_arroy )
. map ( Some )
. or_else ( | e | match e {
2024-06-20 21:59:32 +08:00
arroy ::Error ::MissingMetadata ( _ ) = > Ok ( None ) ,
2024-05-14 17:38:28 +08:00
e = > Err ( e ) ,
} )
. transpose ( ) ;
let Some ( reader ) = reader else {
break 'vectors ;
} ;
let embedding = reader ? . item_vector ( rtxn , docid ) ? ;
if let Some ( embedding ) = embedding {
embeddings . push ( embedding )
} else {
break 'vectors ;
}
}
2024-07-15 21:05:56 +08:00
res . insert ( embedder_name . to_owned ( ) , embeddings ) ;
2024-05-14 17:38:28 +08:00
}
Ok ( res )
}
2020-10-21 21:55:48 +08:00
}
2021-03-31 23:14:23 +08:00
2024-05-30 17:50:30 +08:00
#[ derive(Debug, Deserialize, Serialize) ]
pub struct IndexEmbeddingConfig {
pub name : String ,
pub config : EmbeddingConfig ,
2024-06-05 21:38:49 +08:00
pub user_provided : RoaringBitmap ,
2024-05-30 17:50:30 +08:00
}
2021-03-31 23:14:23 +08:00
#[ cfg(test) ]
2021-04-15 21:29:37 +08:00
pub ( crate ) mod tests {
2022-12-20 17:37:50 +08:00
use std ::collections ::HashSet ;
2021-04-15 21:29:37 +08:00
use std ::ops ::Deref ;
2022-05-16 21:22:52 +08:00
use big_s ::S ;
2022-08-02 21:13:06 +08:00
use heed ::{ EnvOpenOptions , RwTxn } ;
2024-05-21 23:08:45 +08:00
use maplit ::{ btreemap , hashset } ;
2021-04-15 21:29:37 +08:00
use tempfile ::TempDir ;
2021-03-31 23:14:23 +08:00
2022-08-02 21:13:06 +08:00
use crate ::documents ::DocumentsBatchReader ;
2022-08-30 23:17:50 +08:00
use crate ::error ::{ Error , InternalError } ;
2022-04-01 00:23:12 +08:00
use crate ::index ::{ DEFAULT_MIN_WORD_LEN_ONE_TYPO , DEFAULT_MIN_WORD_LEN_TWO_TYPOS } ;
2022-12-05 17:26:53 +08:00
use crate ::update ::{
2024-05-21 23:08:45 +08:00
self , IndexDocuments , IndexDocumentsConfig , IndexDocumentsMethod , IndexerConfig , Setting ,
Settings ,
2022-12-05 17:26:53 +08:00
} ;
2024-05-21 23:08:45 +08:00
use crate ::vector ::settings ::{ EmbedderSource , EmbeddingSettings } ;
2023-02-03 01:19:56 +08:00
use crate ::{ db_snap , obkv_to_json , Filter , Index , Search , SearchResult } ;
2021-03-31 23:14:23 +08:00
2021-04-15 21:29:37 +08:00
pub ( crate ) struct TempIndex {
2022-08-02 21:13:06 +08:00
pub inner : Index ,
pub indexer_config : IndexerConfig ,
pub index_documents_config : IndexDocumentsConfig ,
2021-04-15 21:29:37 +08:00
_tempdir : TempDir ,
}
/// Lets a [`TempIndex`] be used wherever a reference to its inner [`Index`] is expected.
impl Deref for TempIndex {
    type Target = Index;

    fn deref(&self) -> &Index {
        &self.inner
    }
}
impl TempIndex {
2022-08-02 21:13:06 +08:00
/// Creates a temporary index
pub fn new_with_map_size ( size : usize ) -> Self {
2021-04-15 21:29:37 +08:00
let mut options = EnvOpenOptions ::new ( ) ;
2022-08-02 21:13:06 +08:00
options . map_size ( size ) ;
2021-04-15 21:29:37 +08:00
let _tempdir = TempDir ::new_in ( " . " ) . unwrap ( ) ;
let inner = Index ::new ( options , _tempdir . path ( ) ) . unwrap ( ) ;
2022-08-02 21:13:06 +08:00
let indexer_config = IndexerConfig ::default ( ) ;
let index_documents_config = IndexDocumentsConfig ::default ( ) ;
Self { inner , indexer_config , index_documents_config , _tempdir }
}
2023-05-03 20:44:48 +08:00
/// Creates a temporary index, with a default `4096 * 2000` size. This should be enough for
2022-08-02 21:13:06 +08:00
/// most tests.
pub fn new ( ) -> Self {
2023-05-03 20:44:48 +08:00
Self ::new_with_map_size ( 4096 * 2000 )
2022-08-02 21:13:06 +08:00
}
pub fn add_documents_using_wtxn < ' t , R > (
& ' t self ,
2023-11-23 19:07:35 +08:00
wtxn : & mut RwTxn < ' t > ,
2022-08-02 21:13:06 +08:00
documents : DocumentsBatchReader < R > ,
) -> Result < ( ) , crate ::error ::Error >
where
R : std ::io ::Read + std ::io ::Seek ,
{
let builder = IndexDocuments ::new (
wtxn ,
2022-10-10 21:28:03 +08:00
self ,
2022-08-02 21:13:06 +08:00
& self . indexer_config ,
self . index_documents_config . clone ( ) ,
| _ | ( ) ,
2022-10-05 23:41:07 +08:00
| | false ,
2022-08-02 21:13:06 +08:00
)
. unwrap ( ) ;
let ( builder , user_error ) = builder . add_documents ( documents ) . unwrap ( ) ;
user_error ? ;
builder . execute ( ) ? ;
Ok ( ( ) )
}
pub fn add_documents < R > (
& self ,
documents : DocumentsBatchReader < R > ,
) -> Result < ( ) , crate ::error ::Error >
where
R : std ::io ::Read + std ::io ::Seek ,
{
let mut wtxn = self . write_txn ( ) . unwrap ( ) ;
self . add_documents_using_wtxn ( & mut wtxn , documents ) ? ;
wtxn . commit ( ) . unwrap ( ) ;
Ok ( ( ) )
}
pub fn update_settings (
& self ,
2024-07-09 23:25:39 +08:00
update : impl Fn ( & mut Settings < '_ , '_ , '_ > ) ,
2022-08-02 21:13:06 +08:00
) -> Result < ( ) , crate ::error ::Error > {
let mut wtxn = self . write_txn ( ) . unwrap ( ) ;
self . update_settings_using_wtxn ( & mut wtxn , update ) ? ;
wtxn . commit ( ) . unwrap ( ) ;
Ok ( ( ) )
}
pub fn update_settings_using_wtxn < ' t > (
& ' t self ,
2023-11-23 19:07:35 +08:00
wtxn : & mut RwTxn < ' t > ,
2024-07-09 23:25:39 +08:00
update : impl Fn ( & mut Settings < '_ , '_ , '_ > ) ,
2022-08-02 21:13:06 +08:00
) -> Result < ( ) , crate ::error ::Error > {
let mut builder = update ::Settings ::new ( wtxn , & self . inner , & self . indexer_config ) ;
update ( & mut builder ) ;
2022-10-05 23:41:07 +08:00
builder . execute ( drop , | | false ) ? ;
2022-08-02 21:13:06 +08:00
Ok ( ( ) )
2021-04-15 21:29:37 +08:00
}
2022-12-19 16:47:29 +08:00
2023-10-26 18:15:55 +08:00
pub fn delete_documents_using_wtxn < ' t > (
& ' t self ,
2023-11-23 19:07:35 +08:00
wtxn : & mut RwTxn < ' t > ,
2023-10-26 18:15:55 +08:00
external_document_ids : Vec < String > ,
) {
2023-10-25 19:40:46 +08:00
let builder = IndexDocuments ::new (
2023-10-26 18:15:55 +08:00
wtxn ,
2023-10-25 19:40:46 +08:00
self ,
& self . indexer_config ,
self . index_documents_config . clone ( ) ,
| _ | ( ) ,
| | false ,
)
. unwrap ( ) ;
2023-10-25 20:42:09 +08:00
let ( builder , user_error ) = builder . remove_documents ( external_document_ids ) . unwrap ( ) ;
2023-10-25 19:40:46 +08:00
user_error . unwrap ( ) ;
builder . execute ( ) . unwrap ( ) ;
2023-10-26 18:15:55 +08:00
}
pub fn delete_documents ( & self , external_document_ids : Vec < String > ) {
let mut wtxn = self . write_txn ( ) . unwrap ( ) ;
self . delete_documents_using_wtxn ( & mut wtxn , external_document_ids ) ;
2022-12-19 16:47:29 +08:00
wtxn . commit ( ) . unwrap ( ) ;
}
2023-10-25 20:42:09 +08:00
pub fn delete_document ( & self , external_document_id : & str ) {
self . delete_documents ( vec! [ external_document_id . to_string ( ) ] )
}
2021-04-15 21:29:37 +08:00
}
2022-08-30 23:17:50 +08:00
// Executing an indexation whose last callback (presumably the abort check) returns `true`
// must fail with `InternalError::AbortedIndexation`.
#[test]
fn aborting_indexation() {
    use std::sync::atomic::{AtomicBool, Ordering};

    let index = TempIndex::new();
    let mut wtxn = index.inner.write_txn().unwrap();
    let abort_requested = AtomicBool::new(false);

    let indexer = IndexDocuments::new(
        &mut wtxn,
        &index.inner,
        &index.indexer_config,
        index.index_documents_config.clone(),
        |_| (),
        || abort_requested.load(Ordering::Relaxed),
    )
    .unwrap();

    let batch = documents!([
        { " id ": 1, " name ": " kevin " },
        { " id ": 2, " name ": " bob ", " age ": 20 },
        { " id ": 2, " name ": " bob ", " age ": 20 },
    ]);
    let (indexer, user_error) = indexer.add_documents(batch).unwrap();
    user_error.unwrap();

    // The flag is raised only after the documents were added, right before executing.
    abort_requested.store(true, Ordering::Relaxed);
    let err = indexer.execute().unwrap_err();
    assert!(matches!(err, Error::InternalError(InternalError::AbortedIndexation)));
}
2021-04-01 15:07:16 +08:00
// Checks the `field_distribution` snapshot (and once the `word_docids` snapshot) after
// three successive document imports into the same temporary index.
#[ test ]
2021-06-17 21:16:20 +08:00
fn initial_field_distribution ( ) {
2022-08-02 21:13:06 +08:00
let index = TempIndex ::new ( ) ;
// first import: three entries, two of which share the same id
index
. add_documents ( documents! ( [
{ " id " : 1 , " name " : " kevin " } ,
{ " id " : 2 , " name " : " bob " , " age " : 20 } ,
{ " id " : 2 , " name " : " bob " , " age " : 20 } ,
] ) )
. unwrap ( ) ;
2021-03-31 23:14:23 +08:00
2024-05-06 20:49:45 +08:00
// field distribution expected after the first import
db_snap! ( index , field_distribution , @ r ###"
age 1 |
id 2 |
name 2 |
" ###);
2022-08-03 22:24:28 +08:00
// word docids expected after the first import
db_snap! ( index , word_docids ,
2024-05-06 20:49:45 +08:00
@ r ###"
2022-08-03 22:24:28 +08:00
1 [ 0 , ]
2 [ 1 , ]
20 [ 1 , ]
bob [ 1 , ]
kevin [ 0 , ]
" ###
) ;
2021-06-17 23:05:34 +08:00
// we add all the documents a second time. we are supposed to get the same
// field_distribution in the end
2022-08-02 21:13:06 +08:00
index
. add_documents ( documents! ( [
{ " id " : 1 , " name " : " kevin " } ,
{ " id " : 2 , " name " : " bob " , " age " : 20 } ,
{ " id " : 2 , " name " : " bob " , " age " : 20 } ,
] ) )
. unwrap ( ) ;
2021-06-17 23:05:34 +08:00
2022-08-03 22:24:28 +08:00
// same expected field distribution as after the first import
db_snap! ( index , field_distribution ,
@ r ###"
2023-06-14 21:57:31 +08:00
age 1 |
id 2 |
name 2 |
2023-05-03 20:11:20 +08:00
" ###
2021-06-17 23:05:34 +08:00
) ;
// then we update a document by removing one field and another by adding one field
2022-08-02 21:13:06 +08:00
index
. add_documents ( documents! ( [
{ " id " : 1 , " name " : " kevin " , " has_dog " : true } ,
{ " id " : 2 , " name " : " bob " }
] ) )
. unwrap ( ) ;
2021-06-17 23:05:34 +08:00
2022-08-03 22:24:28 +08:00
// `age` is expected to be gone and `has_dog` to appear once
db_snap! ( index , field_distribution ,
@ r ###"
2023-06-14 21:57:31 +08:00
has_dog 1 |
id 2 |
name 2 |
2023-05-03 20:11:20 +08:00
" ###
2021-06-17 23:05:34 +08:00
) ;
2021-03-31 23:14:23 +08:00
}
2022-03-31 15:54:49 +08:00
// The typo authorization defaults to `true` and reads back as `false` once disabled
// and committed.
#[test]
fn put_and_retrieve_disable_typo() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    // default value is true
    let authorized_by_default = index.authorize_typos(&wtxn).unwrap();
    assert!(authorized_by_default);
    // set to false
    index.put_authorize_typos(&mut wtxn, false).unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();
    let authorized_after_update = index.authorize_typos(&rtxn).unwrap();
    assert!(!authorized_after_update);
}
2022-03-31 19:50:18 +08:00
// Both minimum word lengths start at their defaults and read back the values written
// once the transaction is committed.
#[test]
fn set_min_word_len_for_typos() {
    let index = TempIndex::new();

    let mut wtxn = index.write_txn().unwrap();
    let one_typo_default = index.min_word_len_one_typo(&wtxn).unwrap();
    assert_eq!(one_typo_default, DEFAULT_MIN_WORD_LEN_ONE_TYPO);
    let two_typos_default = index.min_word_len_two_typos(&wtxn).unwrap();
    assert_eq!(two_typos_default, DEFAULT_MIN_WORD_LEN_TWO_TYPOS);

    index.put_min_word_len_one_typo(&mut wtxn, 3).unwrap();
    index.put_min_word_len_two_typos(&mut wtxn, 15).unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();
    let one_typo = index.min_word_len_one_typo(&rtxn).unwrap();
    assert_eq!(one_typo, 3);
    let two_typos = index.min_word_len_two_typos(&rtxn).unwrap();
    assert_eq!(two_typos, 15);
}
2022-05-16 21:22:52 +08:00
// Documents are added first and the searchable fields are set afterwards; both the real
// and the user-defined searchable fields are then checked.
#[test]
fn add_documents_and_set_searchable_fields() {
    let index = TempIndex::new();

    let batch = documents!([
        { " id ": 1, " doggo ": " kevin " },
        { " id ": 2, " doggo ": { " name ": " bob ", " age ": 20 } },
        { " id ": 3, " name ": " jean ", " age ": 25 },
    ]);
    index.add_documents(batch).unwrap();

    let configure = |settings: &mut Settings<'_, '_, '_>| {
        settings.set_searchable_fields(vec![S(" doggo "), S(" name ")]);
    };
    index.update_settings(configure).unwrap();

    // ensure we get the right real searchable fields + user defined searchable fields
    let rtxn = index.read_txn().unwrap();

    let real = index.searchable_fields(&rtxn).unwrap();
    assert_eq!(real, &[" doggo ", " name ", " doggo.name ", " doggo.age "]);

    let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
    assert_eq!(user_defined, &[" doggo ", " name "]);
}
/// Restricts the searchable fields on an empty index, then adds documents, and
/// verifies the effective and user-defined searchable fields at both stages.
#[test]
fn set_searchable_fields_and_add_documents() {
    let index = TempIndex::new();

    index
        .update_settings(|builder| {
            builder.set_searchable_fields(vec![S(" doggo "), S(" name ")]);
        })
        .unwrap();

    // With no document indexed yet, both lists are expected to be identical.
    let rtxn_before = index.read_txn().unwrap();
    assert_eq!(index.searchable_fields(&rtxn_before).unwrap(), &[" doggo ", " name "]);
    let declared_before = index.user_defined_searchable_fields(&rtxn_before).unwrap();
    let declared_before = declared_before.unwrap();
    assert_eq!(declared_before, &[" doggo ", " name "]);

    let batch = documents!([
        { " id ": 1, " doggo ": " kevin " },
        { " id ": 2, " doggo ": { " name ": " bob ", " age ": 20 } },
        { " id ": 3, " name ": " jean ", " age ": 25 },
    ]);
    index.add_documents(batch).unwrap();

    // After indexing, the effective list is expected to gain the nested
    // `doggo` sub-fields while the user-defined list stays untouched.
    let rtxn_after = index.read_txn().unwrap();
    assert_eq!(
        index.searchable_fields(&rtxn_after).unwrap(),
        &[" doggo ", " name ", " doggo.name ", " doggo.age "]
    );
    let declared_after = index.user_defined_searchable_fields(&rtxn_after).unwrap();
    let declared_after = declared_after.unwrap();
    assert_eq!(declared_after, &[" doggo ", " name "]);
}
2022-12-05 17:26:53 +08:00
2023-02-03 01:19:56 +08:00
#[ test ]
/// Exercises the `_geoBoundingBox` filter on five documents: one at the
/// origin, two near the longitude edges and two near the latitude edges.
/// Some coordinates are given as strings and others as numbers.
/// The last two requests check the error returned when the first point's
/// latitude is below the second point's latitude.
fn test_basic_geo_bounding_box ( ) {
let index = TempIndex ::new ( ) ;
// `_geo` must be filterable for the bounding-box filter to be used below
index
. update_settings ( | settings | {
settings . set_filterable_fields ( hashset! { S ( " _geo " ) } ) ;
} )
. unwrap ( ) ;
index
. add_documents ( documents! ( [
2023-08-08 22:29:25 +08:00
{ " id " : 0 , " _geo " : { " lat " : " 0 " , " lng " : " 0 " } } ,
{ " id " : 1 , " _geo " : { " lat " : 0 , " lng " : " -175 " } } ,
{ " id " : 2 , " _geo " : { " lat " : " 0 " , " lng " : 175 } } ,
2023-02-03 01:19:56 +08:00
{ " id " : 3 , " _geo " : { " lat " : 85 , " lng " : 0 } } ,
2023-08-08 22:29:25 +08:00
{ " id " : 4 , " _geo " : { " lat " : " -85 " , " lng " : " 0 " } } ,
2023-02-03 01:19:56 +08:00
] ) )
. unwrap ( ) ;
// run the `_geoBoundingBox` filters against the documents indexed above
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let mut search = index . search ( & rtxn ) ;
// exact match a document
let search_result = search
2023-02-06 23:50:27 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([0, 0], [0, 0]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[0]> " ) ;
// match a document in the middle of the rectangle
let search_result = search
2023-03-29 00:26:18 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([10, 10], [-10, -10]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[0]> " ) ;
// select everything
let search_result = search
2023-03-29 00:26:18 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([90, 180], [-90, -180]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[0, 1, 2, 3, 4]> " ) ;
// go on the edge of the longitude
let search_result = search
2023-03-29 00:26:18 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([0, -170], [0, 180]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[1]> " ) ;
// go on the other edge of the longitude
let search_result = search
2023-03-29 00:26:18 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([0, -180], [0, 170]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[2]> " ) ;
// wrap around the longitude
let search_result = search
2023-03-29 00:26:18 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([0, -170], [0, 170]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[1, 2]> " ) ;
// go on the edge of the latitude
let search_result = search
2023-02-06 23:50:27 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([90, 0], [80, 0]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[3]> " ) ;
// go on the other edge of the latitude
let search_result = search
2023-02-06 23:50:27 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([-80, 0], [-90, 0]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( search_result . candidates , @ " RoaringBitmap<[4]> " ) ;
2023-02-07 01:07:00 +08:00
// the requests that don't make sense
2023-02-07 00:50:47 +08:00
2023-02-03 01:19:56 +08:00
// try to wrap around the latitude
2023-02-07 00:50:47 +08:00
let error = search
2023-02-06 23:50:27 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([-80, 0], [80, 0]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
2023-02-07 00:50:47 +08:00
. unwrap_err ( ) ;
2024-07-09 00:38:05 +08:00
insta ::assert_snapshot! (
2023-03-29 00:26:18 +08:00
error ,
@ r ###"
2023-02-07 00:50:47 +08:00
The top latitude ` - 80 ` is below the bottom latitude ` 80 ` .
32 :33 _geoBoundingBox ( [ - 80 , 0 ] , [ 80 , 0 ] )
2023-03-29 00:26:18 +08:00
" ###
) ;
2023-02-03 01:19:56 +08:00
// send a top latitude lower than the bottom latitude
2023-02-07 00:50:47 +08:00
let error = search
2023-02-06 23:50:27 +08:00
. filter ( Filter ::from_str ( " _geoBoundingBox([-10, 0], [10, 0]) " ) . unwrap ( ) . unwrap ( ) )
2023-02-03 01:19:56 +08:00
. execute ( )
2023-02-07 00:50:47 +08:00
. unwrap_err ( ) ;
2024-07-09 00:38:05 +08:00
insta ::assert_snapshot! (
2023-03-29 00:26:18 +08:00
error ,
@ r ###"
2023-02-07 00:50:47 +08:00
The top latitude ` - 10 ` is below the bottom latitude ` 10 ` .
32 :33 _geoBoundingBox ( [ - 10 , 0 ] , [ 10 , 0 ] )
2023-03-29 00:26:18 +08:00
" ###
) ;
2023-02-03 01:19:56 +08:00
}
2024-07-17 17:13:37 +08:00
/// Checks the `CONTAINS` and `NOT CONTAINS` filter operators on a filterable
/// string field, including a document that lacks the field entirely.
#[test]
fn test_contains() {
    let index = TempIndex::new();

    index
        .update_settings(|builder| {
            builder.set_filterable_fields(hashset! { S(" doggo ") });
        })
        .unwrap();

    let batch = documents!([
        { " id ": 0, " doggo ": " kefir " },
        { " id ": 1, " doggo ": " kefirounet " },
        { " id ": 2, " doggo ": " kefkef " },
        { " id ": 3, " doggo ": " fifir " },
        { " id ": 4, " doggo ": " boubou " },
        { " id ": 5 },
    ]);
    index.add_documents(batch).unwrap();

    let rtxn = index.read_txn().unwrap();

    // Plain substring lookup.
    let mut kefir_search = index.search(&rtxn);
    kefir_search.filter(Filter::from_str(" doggo CONTAINS kefir ").unwrap().unwrap());
    let kefir_hits = kefir_search.execute().unwrap();
    insta::assert_debug_snapshot!(kefir_hits.candidates, @" RoaringBitmap<[0, 1]> ");

    // The needle is uppercase while the indexed values are lowercase.
    let mut kef_search = index.search(&rtxn);
    kef_search.filter(Filter::from_str(" doggo CONTAINS KEF ").unwrap().unwrap());
    let kef_hits = kef_search.execute().unwrap();
    insta::assert_debug_snapshot!(kef_hits.candidates, @" RoaringBitmap<[0, 1, 2]> ");

    // The negated form is expected to also return document 5, which has no `doggo` field.
    let mut not_fir_search = index.search(&rtxn);
    not_fir_search.filter(Filter::from_str(" doggo NOT CONTAINS fir ").unwrap().unwrap());
    let not_fir_hits = not_fir_search.execute().unwrap();
    insta::assert_debug_snapshot!(not_fir_hits.candidates, @" RoaringBitmap<[2, 4, 5]> ");
}
2022-12-05 17:26:53 +08:00
#[ test ]
/// Re-adds documents under external ids that already exist and snapshots,
/// after each step, the internal document ids, the external-to-internal id
/// mapping and the numeric facet database of the filterable `doggo` field.
/// The snapshots expect the internal ids to stay `[0, 1, 2, 3]` throughout.
fn replace_documents_external_ids_and_soft_deletion_check ( ) {
use big_s ::S ;
use maplit ::hashset ;
2023-10-25 20:49:25 +08:00
let index = TempIndex ::new ( ) ;
2022-12-05 17:26:53 +08:00
index
. update_settings ( | settings | {
settings . set_primary_key ( " id " . to_owned ( ) ) ;
settings . set_filterable_fields ( hashset! { S ( " doggo " ) } ) ;
} )
. unwrap ( ) ;
// first batch: four documents whose `doggo` value equals their id
let mut docs = vec! [ ] ;
for i in 0 .. 4 {
docs . push ( serde_json ::json! (
{ " id " : i , " doggo " : i }
) ) ;
}
index . add_documents ( documents! ( docs ) ) . unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, 2, 3, ] " ) ;
db_snap! ( index , external_documents_ids , 1 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-05 17:26:53 +08:00
0 0
1 1
2 2
3 3
" ###);
db_snap! ( index , facet_id_f64_docids , 1 , @ r ###"
1 0 0 1 [ 0 , ]
1 0 1 1 [ 1 , ]
1 0 2 1 [ 2 , ]
1 0 3 1 [ 3 , ]
" ###);
// second batch: ids 0..3 again with `doggo` shifted by one; id 3 is not re-sent,
// so the facet snapshot below expects documents 2 and 3 to share the value 3
let mut docs = vec! [ ] ;
for i in 0 .. 3 {
docs . push ( serde_json ::json! (
{ " id " : i , " doggo " : i + 1 }
) ) ;
}
index . add_documents ( documents! ( docs ) ) . unwrap ( ) ;
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, 3, ] " ) ;
2022-12-05 17:26:53 +08:00
db_snap! ( index , external_documents_ids , 2 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
0 0
1 1
2 2
2022-12-05 17:26:53 +08:00
3 3
" ###);
db_snap! ( index , facet_id_f64_docids , 2 , @ r ###"
2023-10-31 17:08:36 +08:00
1 0 1 1 [ 0 , ]
1 0 2 1 [ 1 , ]
1 0 3 1 [ 2 , 3 , ]
2022-12-05 17:26:53 +08:00
" ###);
// third batch: the same external id three times in a single payload;
// the facet snapshot below expects the last `doggo` value (4) for document 3
index
. add_documents ( documents! ( [ { " id " : 3 , " doggo " : 4 } , { " id " : 3 , " doggo " : 5 } , { " id " : 3 , " doggo " : 4 } ] ) )
. unwrap ( ) ;
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, 3, ] " ) ;
2022-12-05 17:26:53 +08:00
db_snap! ( index , external_documents_ids , 3 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
0 0
1 1
2 2
2022-12-05 17:26:53 +08:00
3 3
" ###);
db_snap! ( index , facet_id_f64_docids , 3 , @ r ###"
2023-10-31 17:08:36 +08:00
1 0 1 1 [ 0 , ]
1 0 2 1 [ 1 , ]
1 0 3 1 [ 2 , ]
1 0 4 1 [ 3 , ]
2022-12-05 17:26:53 +08:00
" ###);
// setting a distinct field on `id`: the facet snapshot below expects
// additional entries for field 0 next to the existing ones for field 1
index
. update_settings ( | settings | {
settings . set_distinct_field ( " id " . to_owned ( ) ) ;
} )
. unwrap ( ) ;
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, 3, ] " ) ;
2022-12-05 17:26:53 +08:00
db_snap! ( index , external_documents_ids , 3 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
0 0
1 1
2 2
3 3
2022-12-05 17:26:53 +08:00
" ###);
db_snap! ( index , facet_id_f64_docids , 3 , @ r ###"
2023-10-30 21:48:41 +08:00
0 0 0 1 [ 0 , ]
0 0 1 1 [ 1 , ]
0 0 2 1 [ 2 , ]
0 0 3 1 [ 3 , ]
1 0 1 1 [ 0 , ]
1 0 2 1 [ 1 , ]
1 0 3 1 [ 2 , ]
1 0 4 1 [ 3 , ]
2022-12-05 17:26:53 +08:00
" ###);
}
2022-12-06 18:38:15 +08:00
#[ test ]
/// Regression test for issue 3021, run with the `ReplaceDocuments` update
/// method: after a document is deleted and the settings are updated, the
/// external ids mapping must not keep an entry for the deleted document, so
/// that re-adding the same external id is treated as a new document.
fn bug_3021_first ( ) {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex ::new ( ) ;
index . index_documents_config . update_method = IndexDocumentsMethod ::ReplaceDocuments ;
index
. update_settings ( | settings | {
settings . set_primary_key ( " primary_key " . to_owned ( ) ) ;
} )
. unwrap ( ) ;
index
. add_documents ( documents! ( [
{ " primary_key " : 38 } ,
{ " primary_key " : 34 }
] ) )
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, ] " ) ;
db_snap! ( index , external_documents_ids , 1 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
34 1
38 0
" ###);
2022-12-19 16:47:29 +08:00
index . delete_document ( " 34 " ) ;
2022-12-06 18:38:15 +08:00
db_snap! ( index , documents_ids , @ " [0, ] " ) ;
db_snap! ( index , external_documents_ids , 2 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
38 0
" ###);
// a settings update with an empty searchable fields list, run after the deletion
index
. update_settings ( | s | {
s . set_searchable_fields ( vec! [ ] ) ;
} )
. unwrap ( ) ;
// The key point of the test is to verify that the external documents ids
// do not contain any entry for previously soft-deleted document ids
db_snap! ( index , documents_ids , @ " [0, ] " ) ;
db_snap! ( index , external_documents_ids , 3 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
38 0
" ###);
// So that this document addition works correctly now.
// It would be wrongly interpreted as a replacement before
index . add_documents ( documents! ( { " primary_key " : 34 } ) ) . unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, ] " ) ;
db_snap! ( index , external_documents_ids , 4 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
34 1
38 0
" ###);
// We do the test again, but deleting the document with id 0 instead of id 1 now
2022-12-19 16:47:29 +08:00
index . delete_document ( " 38 " ) ;
2022-12-06 18:38:15 +08:00
db_snap! ( index , documents_ids , @ " [1, ] " ) ;
db_snap! ( index , external_documents_ids , 5 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
34 1
" ###);
index
. update_settings ( | s | {
s . set_searchable_fields ( vec! [ " primary_key " . to_owned ( ) ] ) ;
} )
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [1, ] " ) ;
db_snap! ( index , external_documents_ids , 6 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
34 1
" ###);
// And adding lots of documents afterwards instead of just one.
// These extra subtests don't add much, but it's better than nothing.
// Note that the payload contains the external id 41 twice.
index . add_documents ( documents! ( [ { " primary_key " : 38 } , { " primary_key " : 39 } , { " primary_key " : 41 } , { " primary_key " : 40 } , { " primary_key " : 41 } , { " primary_key " : 42 } ] ) ) . unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, 2, 3, 4, 5, ] " ) ;
db_snap! ( index , external_documents_ids , 7 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
34 1
38 0
39 2
40 4
41 3
42 5
" ###);
}
2023-10-25 23:32:36 +08:00
#[ test ]
/// Adds two documents with the `UpdateDocuments` method, deletes one of them
/// by external id, and snapshots the internal ids and the external ids
/// mapping before and after the deletion.
fn simple_delete ( ) {
let mut index = TempIndex ::new ( ) ;
index . index_documents_config . update_method = IndexDocumentsMethod ::UpdateDocuments ;
index
. add_documents ( documents! ( [
{ " id " : 30 } ,
{ " id " : 34 }
] ) )
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, ] " ) ;
db_snap! ( index , external_documents_ids , 1 , @ r ###"
docids :
30 0
34 1 " ###);
// delete by external id; only internal id 0 is expected to remain
index . delete_document ( " 34 " ) ;
db_snap! ( index , documents_ids , @ " [0, ] " ) ;
db_snap! ( index , external_documents_ids , 2 , @ r ###"
docids :
30 0
" ###);
}
2022-12-06 18:38:15 +08:00
#[ test ]
/// Regression test for issue 3021, run with the `UpdateDocuments` update
/// method: after a deletion and a settings update, a newly added document
/// must not share an internal id with a previously deleted one, and re-adding
/// the deleted external id must not merge with or replace another document.
fn bug_3021_second ( ) {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex ::new ( ) ;
index . index_documents_config . update_method = IndexDocumentsMethod ::UpdateDocuments ;
index
. update_settings ( | settings | {
settings . set_primary_key ( " primary_key " . to_owned ( ) ) ;
} )
. unwrap ( ) ;
index
. add_documents ( documents! ( [
{ " primary_key " : 30 } ,
{ " primary_key " : 34 }
] ) )
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, ] " ) ;
db_snap! ( index , external_documents_ids , 1 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
30 0
34 1
" ###);
2022-12-19 16:47:29 +08:00
index . delete_document ( " 34 " ) ;
2022-12-06 18:38:15 +08:00
db_snap! ( index , documents_ids , @ " [0, ] " ) ;
db_snap! ( index , external_documents_ids , 2 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
30 0
" ###);
index
. update_settings ( | s | {
s . set_searchable_fields ( vec! [ ] ) ;
} )
. unwrap ( ) ;
// The key point of the test is to verify that the external documents ids
// do not contain any entry for previously soft-deleted document ids
db_snap! ( index , documents_ids , @ " [0, ] " ) ;
db_snap! ( index , external_documents_ids , 3 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
30 0
" ###);
// So that when we add a new document
index . add_documents ( documents! ( { " primary_key " : 35 , " b " : 2 } ) ) . unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, ] " ) ;
// The external documents ids don't have several external ids pointing to the same
// internal document id
db_snap! ( index , external_documents_ids , 4 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
30 0
35 1
" ###);
// And when we add 34 again, we don't replace document 35
index . add_documents ( documents! ( { " primary_key " : 34 , " a " : 1 } ) ) . unwrap ( ) ;
// And document 35 still exists, is not deleted
db_snap! ( index , documents_ids , @ " [0, 1, 2, ] " ) ;
db_snap! ( index , external_documents_ids , 5 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
30 0
34 2
35 1
" ###);
// Read back the stored content of internal document 0 and convert it to JSON
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let ( _docid , obkv ) = index . documents ( & rtxn , [ 0 ] ) . unwrap ( ) [ 0 ] ;
let json = obkv_to_json ( & [ 0 , 1 , 2 ] , & index . fields_ids_map ( & rtxn ) . unwrap ( ) , obkv ) . unwrap ( ) ;
insta ::assert_debug_snapshot! ( json , @ r ###"
{
" primary_key " : Number ( 30 ) ,
}
" ###);
// Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34
let ( _docid , obkv ) = index . documents ( & rtxn , [ 2 ] ) . unwrap ( ) [ 0 ] ;
let json = obkv_to_json ( & [ 0 , 1 , 2 ] , & index . fields_ids_map ( & rtxn ) . unwrap ( ) , obkv ) . unwrap ( ) ;
insta ::assert_debug_snapshot! ( json , @ r ###"
{
" primary_key " : Number ( 34 ) ,
" a " : Number ( 1 ) ,
}
" ###);
// The read transaction is released explicitly before the next addition
drop ( rtxn ) ;
// Add new documents again
index
. add_documents (
documents! ( [ { " primary_key " : 37 } , { " primary_key " : 38 } , { " primary_key " : 39 } ] ) ,
)
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, 2, 3, 4, 5, ] " ) ;
db_snap! ( index , external_documents_ids , 6 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-06 18:38:15 +08:00
30 0
34 2
35 1
37 3
38 4
39 5
" ###);
}
2022-12-09 02:18:25 +08:00
#[ test ]
/// Regression test for issue 3021, run with the `UpdateDocuments` update
/// method: deletes a document, updates another one using a string primary
/// key value, then re-adds the deleted external id and snapshots the internal
/// ids and the external ids mapping after each step.
fn bug_3021_third ( ) {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex ::new ( ) ;
index . index_documents_config . update_method = IndexDocumentsMethod ::UpdateDocuments ;
index
. update_settings ( | settings | {
settings . set_primary_key ( " primary_key " . to_owned ( ) ) ;
} )
. unwrap ( ) ;
index
. add_documents ( documents! ( [
{ " primary_key " : 3 } ,
{ " primary_key " : 4 } ,
{ " primary_key " : 5 }
] ) )
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, 2, ] " ) ;
db_snap! ( index , external_documents_ids , 1 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-09 02:18:25 +08:00
3 0
4 1
5 2
" ###);
2022-12-19 16:47:29 +08:00
index . delete_document ( " 3 " ) ;
2022-12-09 02:18:25 +08:00
db_snap! ( index , documents_ids , @ " [1, 2, ] " ) ;
db_snap! ( index , external_documents_ids , 2 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-09 02:18:25 +08:00
4 1
5 2
" ###);
// update an existing document, this time giving the primary key as a string;
// the snapshots below expect the ids to be unchanged
index . add_documents ( documents! ( [ { " primary_key " : " 4 " , " a " : 2 } ] ) ) . unwrap ( ) ;
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [1, 2, ] " ) ;
2022-12-09 02:18:25 +08:00
db_snap! ( index , external_documents_ids , 2 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
4 1
2022-12-09 02:18:25 +08:00
5 2
" ###);
// re-add the external id that was deleted above
index
. add_documents ( documents! ( [
{ " primary_key " : " 3 " } ,
] ) )
. unwrap ( ) ;
2022-12-12 19:42:55 +08:00
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, ] " ) ;
2022-12-12 19:42:55 +08:00
db_snap! ( index , external_documents_ids , 2 , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-12 19:42:55 +08:00
3 0
2023-10-26 00:02:43 +08:00
4 1
2022-12-12 19:42:55 +08:00
5 2
" ###);
2022-12-09 02:18:25 +08:00
}
2022-12-20 17:37:50 +08:00
#[ test ]
/// Regression test for issue 3021, run with the `UpdateDocuments` update
/// method: interleaves document additions with a `delete_documents` call on a
/// default argument and snapshots the ids, the fields ids map, the searchable
/// fields and the field weights after each step. It finally runs a search and
/// asserts that no primary key value appears twice among the results.
fn bug_3021_fourth ( ) {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex ::new ( ) ;
index . index_documents_config . update_method = IndexDocumentsMethod ::UpdateDocuments ;
index
. update_settings ( | settings | {
settings . set_primary_key ( " primary_key " . to_owned ( ) ) ;
} )
. unwrap ( ) ;
index
. add_documents ( documents! ( [
{ " primary_key " : 11 } ,
{ " primary_key " : 4 } ,
] ) )
. unwrap ( ) ;
db_snap! ( index , documents_ids , @ " [0, 1, ] " ) ;
db_snap! ( index , external_documents_ids , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
2022-12-20 17:37:50 +08:00
11 0
4 1
" ###);
2024-05-07 23:56:40 +08:00
db_snap! ( index , fields_ids_map , @ r ###"
0 primary_key |
" ###);
db_snap! ( index , searchable_fields , @ r ### "["primary_key"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
" ###);
2022-12-20 17:37:50 +08:00
// update document 4 with a new field `a` and add a brand new document
index
. add_documents ( documents! ( [
{ " primary_key " : 4 , " a " : 0 } ,
{ " primary_key " : 1 } ,
] ) )
. unwrap ( ) ;
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, ] " ) ;
2022-12-20 17:37:50 +08:00
db_snap! ( index , external_documents_ids , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
1 2
2022-12-20 17:37:50 +08:00
11 0
2023-10-26 00:02:43 +08:00
4 1
2022-12-20 17:37:50 +08:00
" ###);
2024-05-07 23:56:40 +08:00
db_snap! ( index , fields_ids_map , @ r ###"
0 primary_key |
1 a |
" ###);
db_snap! ( index , searchable_fields , @ r ### "["primary_key", "a"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
2024-05-16 07:06:33 +08:00
1 0 |
2024-05-07 23:56:40 +08:00
" ###);
2022-12-20 17:37:50 +08:00
2023-10-25 20:49:25 +08:00
// deletion called with a default argument (presumably an empty list of ids);
// the snapshots below expect nothing to change
index . delete_documents ( Default ::default ( ) ) ;
2022-12-20 17:37:50 +08:00
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, ] " ) ;
2022-12-20 17:37:50 +08:00
db_snap! ( index , external_documents_ids , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
1 2
2022-12-20 17:37:50 +08:00
11 0
2023-10-26 00:02:43 +08:00
4 1
2022-12-20 17:37:50 +08:00
" ###);
2024-05-07 23:56:40 +08:00
db_snap! ( index , fields_ids_map , @ r ###"
0 primary_key |
1 a |
" ###);
db_snap! ( index , searchable_fields , @ r ### "["primary_key", "a"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
2024-05-16 07:06:33 +08:00
1 0 |
2024-05-07 23:56:40 +08:00
" ###);
2022-12-20 17:37:50 +08:00
// update both existing documents again; ids are expected to stay the same
index
. add_documents ( documents! ( [
{ " primary_key " : 4 , " a " : 1 } ,
{ " primary_key " : 1 , " a " : 0 } ,
] ) )
. unwrap ( ) ;
2023-10-26 00:02:43 +08:00
db_snap! ( index , documents_ids , @ " [0, 1, 2, ] " ) ;
2022-12-20 17:37:50 +08:00
db_snap! ( index , external_documents_ids , @ r ###"
2023-10-26 00:02:43 +08:00
docids :
1 2
2022-12-20 17:37:50 +08:00
11 0
4 1
" ###);
2024-05-07 23:56:40 +08:00
db_snap! ( index , fields_ids_map , @ r ###"
0 primary_key |
1 a |
" ###);
db_snap! ( index , searchable_fields , @ r ### "["primary_key", "a"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
2024-05-16 07:06:33 +08:00
1 0 |
2024-05-07 23:56:40 +08:00
" ###);
2022-12-20 17:37:50 +08:00
// run a search without query or filter and keep only the returned document ids
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let search = Search ::new ( & rtxn , & index ) ;
2023-06-07 00:26:33 +08:00
let SearchResult {
matching_words : _ ,
candidates : _ ,
document_scores : _ ,
mut documents_ids ,
2024-03-05 18:21:46 +08:00
degraded : _ ,
2024-03-27 01:07:43 +08:00
used_negative_operator : _ ,
2023-06-07 00:26:33 +08:00
} = search . execute ( ) . unwrap ( ) ;
2022-12-20 17:37:50 +08:00
let primary_key_id = index . fields_ids_map ( & rtxn ) . unwrap ( ) . id ( " primary_key " ) . unwrap ( ) ;
documents_ids . sort_unstable ( ) ;
let docs = index . documents ( & rtxn , documents_ids ) . unwrap ( ) ;
// every returned document must carry a distinct primary key value:
// `insert` returns false on a duplicate, which fails the assertion
let mut all_ids = HashSet ::new ( ) ;
for ( _docid , obkv ) in docs {
let id = obkv . get ( primary_key_id ) . unwrap ( ) ;
assert! ( all_ids . insert ( id ) ) ;
}
}
2023-01-18 20:24:26 +08:00
/// Regression test for issue 3007: a document whose `_geo` latitude and
/// longitude are both unparseable must be rejected with
/// `GeoError::BadLatitudeAndLongitude`, and the geo-faceted ids snapshot taken
/// after the rejection must match the one taken before it.
#[test]
fn bug_3007() {
    // https://github.com/meilisearch/meilisearch/issues/3007

    use crate::error::{GeoError, UserError};

    let index = TempIndex::new();

    // The index has a `_geo` field that is filterable but NOT part of the sortable fields.
    index
        .update_settings(|builder| {
            builder.set_primary_key(" id ".to_string());
            builder.set_filterable_fields(HashSet::from([" _geo ".to_string()]));
        })
        .unwrap();

    // A valid geo document is accepted.
    let valid = documents!({ " id ": 5, " _geo ": { " lat ": 12.0, " lng ": 11.0 } });
    index.add_documents(valid).unwrap();
    db_snap!(index, geo_faceted_documents_ids);

    // Neither coordinate can be parsed: GeoError::BadLatitudeAndLongitude is expected.
    let invalid =
        documents!({ " id ": 6, " _geo ": { " lat ": " unparseable ", " lng ": " unparseable " } });
    let err1 = index.add_documents(invalid).unwrap_err();
    assert!(matches!(
        err1,
        Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. }))
    ));

    // Same snapshot name as above: ensures that no more document was inserted.
    db_snap!(index, geo_faceted_documents_ids);
}
2023-01-24 19:20:50 +08:00
#[ test ]
/// Checks that a document whose `_geo` object carries keys other than `lat`
/// and `lng` is rejected, and that the error message lists the unexpected
/// keys together with their values, for both a single simple value and
/// several nested values.
fn unexpected_extra_fields_in_geo_field ( ) {
let index = TempIndex ::new ( ) ;
index
. update_settings ( | settings | {
settings . set_primary_key ( " id " . to_string ( ) ) ;
settings . set_filterable_fields ( HashSet ::from ( [ " _geo " . to_string ( ) ] ) ) ;
} )
. unwrap ( ) ;
// a single unexpected field with a simple value
let err = index
. add_documents (
documents! ( { " id " : " doggo " , " _geo " : { " lat " : 1 , " lng " : 2 , " doggo " : " are the best " } } ) ,
)
. unwrap_err ( ) ;
2024-07-09 00:38:05 +08:00
insta ::assert_snapshot! ( err , @ r ### "The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"doggo":"are the best"}`."### ) ;
2023-01-24 19:20:50 +08:00
db_snap! ( index , geo_faceted_documents_ids ) ; // ensure that no documents were inserted
// multiple fields and complex values
let err = index
. add_documents (
documents! ( { " id " : " doggo " , " _geo " : { " lat " : 1 , " lng " : 2 , " doggo " : " are the best " , " and " : { " all " : [ " cats " , { " are " : " beautiful " } ] } } } ) ,
)
. unwrap_err ( ) ;
2024-07-09 00:38:05 +08:00
insta ::assert_snapshot! ( err , @ r ### "The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."### ) ;
2023-01-24 19:20:50 +08:00
db_snap! ( index , geo_faceted_documents_ids ) ; // ensure that no documents were inserted
}
2024-05-13 22:18:05 +08:00
#[ test ]
/// Regression test for issue 4484: extending the searchable fields after
/// documents were indexed must leave the fields ids map untouched and only
/// add the new searchable field, with its weight, to the weights map.
fn swapping_searchable_attributes ( ) {
// See https://github.com/meilisearch/meilisearch/issues/4484
let index = TempIndex ::new ( ) ;
index
. update_settings ( | settings | {
settings . set_searchable_fields ( vec! [ S ( " name " ) ] ) ;
settings . set_filterable_fields ( HashSet ::from ( [ S ( " age " ) ] ) ) ;
} )
. unwrap ( ) ;
index
. add_documents ( documents! ( { " id " : 1 , " name " : " Many " , " age " : 28 , " realName " : " Maxime " } ) )
. unwrap ( ) ;
// the snapshot below expects `name` to hold field id 0, ahead of the
// fields that only come from the document
db_snap! ( index , fields_ids_map , @ r ###"
0 name |
1 id |
2 age |
3 realName |
" ###);
db_snap! ( index , searchable_fields , @ r ### "["name"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
" ###);
// add `realName` as a second searchable field, keeping the same filterable fields
index
. update_settings ( | settings | {
settings . set_searchable_fields ( vec! [ S ( " name " ) , S ( " realName " ) ] ) ;
settings . set_filterable_fields ( HashSet ::from ( [ S ( " age " ) ] ) ) ;
} )
. unwrap ( ) ;
2024-05-14 16:45:06 +08:00
2024-05-13 22:49:08 +08:00
// The order of the field id map shouldn't change
2024-05-13 22:18:05 +08:00
db_snap! ( index , fields_ids_map , @ r ###"
0 name |
2024-05-13 22:49:08 +08:00
1 id |
2 age |
3 realName |
2024-05-13 22:18:05 +08:00
" ###);
db_snap! ( index , searchable_fields , @ r ### "["name", "realName"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
2024-05-13 22:49:08 +08:00
3 1 |
2024-05-13 22:18:05 +08:00
" ###);
}
2024-05-14 16:45:06 +08:00
#[ test ]
/// Regression test for issue 4484: swapping the order of the searchable
/// fields must change the order of the search results. The same query is
/// run before and after the swap and the returned document order is
/// expected to be reversed.
fn attribute_weights_after_swapping_searchable_attributes ( ) {
// See https://github.com/meilisearch/meilisearch/issues/4484
let index = TempIndex ::new ( ) ;
index
. update_settings ( | settings | {
settings . set_searchable_fields ( vec! [ S ( " name " ) , S ( " beverage " ) ] ) ;
} )
. unwrap ( ) ;
// the query word appears in `name` for document 0 and in `beverage` for document 1
index
. add_documents ( documents! ( [
{ " id " : 0 , " name " : " kefir " , " beverage " : " water " } ,
{ " id " : 1 , " name " : " tamo " , " beverage " : " kefir " }
] ) )
. unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let mut search = index . search ( & rtxn ) ;
let results = search . query ( " kefir " ) . execute ( ) . unwrap ( ) ;
// We should find kefir the dog first
insta ::assert_debug_snapshot! ( results . documents_ids , @ r ###"
[
0 ,
1 ,
]
" ###);
// swap the two searchable fields so that `beverage` now comes first
index
. update_settings ( | settings | {
settings . set_searchable_fields ( vec! [ S ( " beverage " ) , S ( " name " ) ] ) ;
} )
. unwrap ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let mut search = index . search ( & rtxn ) ;
let results = search . query ( " kefir " ) . execute ( ) . unwrap ( ) ;
// We should find tamo first
insta ::assert_debug_snapshot! ( results . documents_ids , @ r ###"
[
1 ,
2024-05-14 22:56:08 +08:00
0 ,
2024-05-14 16:45:06 +08:00
]
" ###);
}
2024-05-21 23:08:45 +08:00
#[ test ]
/// Checks that the content of the `_vectors` field can be found neither by a
/// text query nor by a filter. This is verified with default settings, after
/// explicitly declaring `_vectors` and `_vectors.doggo` as searchable and
/// filterable, and after configuring a user-provided embedder named `doggo`.
fn vectors_are_never_indexed_as_searchable_or_filterable ( ) {
let index = TempIndex ::new ( ) ;
index
. add_documents ( documents! ( [
{ " id " : 0 , " _vectors " : { " doggo " : [ 2345 ] } } ,
{ " id " : 1 , " _vectors " : { " doggo " : [ 6789 ] } } ,
] ) )
. unwrap ( ) ;
// the vector fields get field ids, but only `id` is expected to be searchable
db_snap! ( index , fields_ids_map , @ r ###"
0 id |
1 _vectors |
2 _vectors . doggo |
" ###);
db_snap! ( index , searchable_fields , @ r ### "["id"]"### ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
0 0 |
" ###);
// querying for a value stored in a vector must not match anything
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let mut search = index . search ( & rtxn ) ;
let results = search . query ( " 2345 " ) . execute ( ) . unwrap ( ) ;
assert! ( results . candidates . is_empty ( ) ) ;
drop ( rtxn ) ;
// explicitly request the vector fields as searchable and filterable
index
. update_settings ( | settings | {
settings . set_searchable_fields ( vec! [ S ( " _vectors " ) , S ( " _vectors.doggo " ) ] ) ;
settings . set_filterable_fields ( hashset! [ S ( " _vectors " ) , S ( " _vectors.doggo " ) ] ) ;
} )
. unwrap ( ) ;
// the snapshots below expect no searchable field and no weight at all
db_snap! ( index , fields_ids_map , @ r ###"
0 id |
1 _vectors |
2 _vectors . doggo |
" ###);
db_snap! ( index , searchable_fields , @ " [] " ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
" ###);
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let mut search = index . search ( & rtxn ) ;
let results = search . query ( " 2345 " ) . execute ( ) . unwrap ( ) ;
assert! ( results . candidates . is_empty ( ) ) ;
// filtering on a vector value must not match anything either
let mut search = index . search ( & rtxn ) ;
let results = search
. filter ( Filter ::from_str ( " _vectors.doggo = 6789 " ) . unwrap ( ) . unwrap ( ) )
. execute ( )
. unwrap ( ) ;
assert! ( results . candidates . is_empty ( ) ) ;
// configure a user-provided embedder named `doggo` with one dimension
index
. update_settings ( | settings | {
settings . set_embedder_settings ( btreemap! {
S ( " doggo " ) = > Setting ::Set ( EmbeddingSettings {
dimensions : Setting ::Set ( 1 ) ,
source : Setting ::Set ( EmbedderSource ::UserProvided ) ,
.. EmbeddingSettings ::default ( ) } ) ,
} ) ;
} )
. unwrap ( ) ;
// same expectations as before the embedder was configured
db_snap! ( index , fields_ids_map , @ r ###"
0 id |
1 _vectors |
2 _vectors . doggo |
" ###);
db_snap! ( index , searchable_fields , @ " [] " ) ;
db_snap! ( index , fieldids_weights_map , @ r ###"
fid weight
" ###);
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let mut search = index . search ( & rtxn ) ;
let results = search . query ( " 2345 " ) . execute ( ) . unwrap ( ) ;
assert! ( results . candidates . is_empty ( ) ) ;
let mut search = index . search ( & rtxn ) ;
let results = search
. filter ( Filter ::from_str ( " _vectors.doggo = 6789 " ) . unwrap ( ) . unwrap ( ) )
. execute ( )
. unwrap ( ) ;
assert! ( results . candidates . is_empty ( ) ) ;
}
2021-03-31 23:14:23 +08:00
}