2021-04-28 23:58:16 +08:00
|
|
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
2020-11-12 00:08:18 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
use anyhow::Context;
|
2021-03-12 01:42:21 +08:00
|
|
|
use chrono::Utc;
|
2020-11-03 20:20:11 +08:00
|
|
|
use grenad::CompressionType;
|
2021-01-21 00:27:43 +08:00
|
|
|
use itertools::Itertools;
|
2021-04-10 03:56:20 +08:00
|
|
|
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
2020-11-03 20:20:11 +08:00
|
|
|
use rayon::ThreadPool;
|
2021-04-07 19:33:44 +08:00
|
|
|
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
2020-11-02 22:31:20 +08:00
|
|
|
|
2021-04-07 19:33:44 +08:00
|
|
|
use crate::{FieldsIdsMap, Index};
|
2020-11-11 19:39:09 +08:00
|
|
|
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
|
2021-04-07 19:33:44 +08:00
|
|
|
use crate::update::index_documents::{IndexDocumentsMethod, Transform};
|
|
|
|
|
2021-04-07 20:06:14 +08:00
|
|
|
/// The tri-state value of a single index setting inside a settings update.
///
/// - `Set(T)`: the caller provided a new value to store.
/// - `Reset`: the caller explicitly asked to remove the stored value
///   (sent as `null` over the wire).
/// - `NotSet`: the setting was absent from the update and the stored
///   value must be left untouched.
//
// `Eq` is derived alongside `PartialEq` (the derive is bounded on `T: Eq`,
// so it costs nothing for types that are only `PartialEq`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}
|
|
|
|
|
|
|
|
impl<T> Default for Setting<T> {
|
|
|
|
fn default() -> Self {
|
|
|
|
Self::NotSet
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<T> Setting<T> {
|
|
|
|
pub const fn is_not_set(&self) -> bool {
|
|
|
|
matches!(self, Self::NotSet)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<T: Serialize> Serialize for Setting<T> {
|
|
|
|
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer {
|
|
|
|
match self {
|
|
|
|
Self::Set(value) => Some(value),
|
|
|
|
// Usually not_set isn't serialized by setting skip_serializing_if field attribute
|
|
|
|
Self::NotSet | Self::Reset => None,
|
|
|
|
}.serialize(serializer)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
|
|
|
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: Deserializer<'de> {
|
|
|
|
Deserialize::deserialize(deserializer).map(|x| match x {
|
|
|
|
Some(x) => Self::Set(x),
|
|
|
|
None => Self::Reset, // Reset is forced by sending null value
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
2020-11-03 20:20:11 +08:00
|
|
|
|
|
|
|
pub struct Settings<'a, 't, 'u, 'i> {
|
2020-11-02 22:31:20 +08:00
|
|
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
|
|
index: &'i Index,
|
2020-11-03 20:20:11 +08:00
|
|
|
pub(crate) log_every_n: Option<usize>,
|
|
|
|
pub(crate) max_nb_chunks: Option<usize>,
|
|
|
|
pub(crate) max_memory: Option<usize>,
|
|
|
|
pub(crate) linked_hash_map_size: Option<usize>,
|
|
|
|
pub(crate) chunk_compression_type: CompressionType,
|
|
|
|
pub(crate) chunk_compression_level: Option<u32>,
|
|
|
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
|
|
|
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
2020-12-22 23:21:07 +08:00
|
|
|
update_id: u64,
|
2020-11-03 20:20:11 +08:00
|
|
|
|
2021-04-07 19:33:44 +08:00
|
|
|
searchable_fields: Setting<Vec<String>>,
|
|
|
|
displayed_fields: Setting<Vec<String>>,
|
2021-06-01 18:19:55 +08:00
|
|
|
filterable_fields: Setting<HashSet<String>>,
|
2021-04-07 19:33:44 +08:00
|
|
|
criteria: Setting<Vec<String>>,
|
|
|
|
stop_words: Setting<BTreeSet<String>>,
|
2021-06-01 22:29:14 +08:00
|
|
|
distinct_field: Setting<String>,
|
2021-04-07 16:53:57 +08:00
|
|
|
synonyms: Setting<HashMap<String, Vec<String>>>,
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
|
|
|
|
2020-11-03 20:20:11 +08:00
|
|
|
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
2020-12-22 23:21:07 +08:00
|
|
|
pub fn new(
|
|
|
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
|
|
index: &'i Index,
|
|
|
|
update_id: u64,
|
|
|
|
) -> Settings<'a, 't, 'u, 'i> {
|
2020-11-03 20:20:11 +08:00
|
|
|
Settings {
|
|
|
|
wtxn,
|
|
|
|
index,
|
|
|
|
log_every_n: None,
|
|
|
|
max_nb_chunks: None,
|
|
|
|
max_memory: None,
|
|
|
|
linked_hash_map_size: None,
|
|
|
|
chunk_compression_type: CompressionType::None,
|
|
|
|
chunk_compression_level: None,
|
|
|
|
chunk_fusing_shrink_size: None,
|
|
|
|
thread_pool: None,
|
2021-04-07 19:33:44 +08:00
|
|
|
searchable_fields: Setting::NotSet,
|
|
|
|
displayed_fields: Setting::NotSet,
|
2021-06-01 18:19:55 +08:00
|
|
|
filterable_fields: Setting::NotSet,
|
2021-04-07 19:33:44 +08:00
|
|
|
criteria: Setting::NotSet,
|
|
|
|
stop_words: Setting::NotSet,
|
2021-06-01 22:29:14 +08:00
|
|
|
distinct_field: Setting::NotSet,
|
2021-04-07 16:53:57 +08:00
|
|
|
synonyms: Setting::NotSet,
|
2020-12-22 23:21:07 +08:00
|
|
|
update_id,
|
2020-11-03 20:20:11 +08:00
|
|
|
}
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
|
|
|
|
2020-11-03 20:42:29 +08:00
|
|
|
pub fn reset_searchable_fields(&mut self) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.searchable_fields = Setting::Reset;
|
2020-11-03 20:42:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn set_searchable_fields(&mut self, names: Vec<String>) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.searchable_fields = Setting::Set(names);
|
2020-11-03 20:42:29 +08:00
|
|
|
}
|
|
|
|
|
2020-11-02 22:31:20 +08:00
|
|
|
pub fn reset_displayed_fields(&mut self) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.displayed_fields = Setting::Reset;
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn set_displayed_fields(&mut self, names: Vec<String>) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.displayed_fields = Setting::Set(names);
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
|
|
|
|
2021-06-01 18:19:55 +08:00
|
|
|
pub fn reset_filterable_fields(&mut self) {
|
|
|
|
self.filterable_fields = Setting::Reset;
|
2021-01-21 00:27:43 +08:00
|
|
|
}
|
|
|
|
|
2021-06-01 18:19:55 +08:00
|
|
|
pub fn set_filterable_fields(&mut self, names: HashSet<String>) {
|
|
|
|
self.filterable_fields = Setting::Set(names);
|
2020-11-12 00:08:18 +08:00
|
|
|
}
|
|
|
|
|
2020-12-04 19:02:22 +08:00
|
|
|
pub fn reset_criteria(&mut self) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.criteria = Setting::Reset;
|
2020-12-04 19:02:22 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn set_criteria(&mut self, criteria: Vec<String>) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.criteria = Setting::Set(criteria);
|
2020-12-04 19:02:22 +08:00
|
|
|
}
|
|
|
|
|
2021-03-30 01:15:47 +08:00
|
|
|
pub fn reset_stop_words(&mut self) {
|
2021-04-07 19:33:44 +08:00
|
|
|
self.stop_words = Setting::Reset;
|
2021-03-30 01:15:47 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
|
|
|
|
self.stop_words = if stop_words.is_empty() {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Reset
|
2021-03-30 01:15:47 +08:00
|
|
|
} else {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Set(stop_words)
|
2021-03-30 01:15:47 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
pub fn reset_distinct_field(&mut self) {
|
|
|
|
self.distinct_field = Setting::Reset;
|
2021-04-07 16:53:57 +08:00
|
|
|
}
|
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
pub fn set_distinct_field(&mut self, distinct_field: String) {
|
|
|
|
self.distinct_field = Setting::Set(distinct_field);
|
2021-04-07 18:38:48 +08:00
|
|
|
}
|
|
|
|
|
2021-04-07 16:53:57 +08:00
|
|
|
pub fn reset_synonyms(&mut self) {
|
|
|
|
self.synonyms = Setting::Reset;
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
|
|
|
|
self.synonyms = if synonyms.is_empty() {
|
|
|
|
Setting::Reset
|
|
|
|
} else {
|
|
|
|
Setting::Set(synonyms)
|
|
|
|
}
|
2021-04-07 18:38:48 +08:00
|
|
|
}
|
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
|
2021-06-01 22:29:14 +08:00
|
|
|
where
|
|
|
|
F: Fn(UpdateIndexingStep, u64) + Sync
|
2020-11-03 20:20:11 +08:00
|
|
|
{
|
2020-11-14 05:35:02 +08:00
|
|
|
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
2020-12-22 23:21:07 +08:00
|
|
|
let update_id = self.update_id;
|
|
|
|
let cb = |step| cb(step, update_id);
|
2021-01-21 00:27:43 +08:00
|
|
|
// if the settings are set before any document update, we don't need to do anything, and
|
|
|
|
// will set the primary key during the first document addition.
|
|
|
|
if self.index.number_of_documents(&self.wtxn)? == 0 {
|
2021-04-07 19:33:44 +08:00
|
|
|
return Ok(());
|
2021-01-21 00:27:43 +08:00
|
|
|
}
|
2020-11-14 05:35:02 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
let transform = Transform {
|
|
|
|
rtxn: &self.wtxn,
|
|
|
|
index: self.index,
|
|
|
|
log_every_n: self.log_every_n,
|
|
|
|
chunk_compression_type: self.chunk_compression_type,
|
|
|
|
chunk_compression_level: self.chunk_compression_level,
|
|
|
|
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
|
|
|
|
max_nb_chunks: self.max_nb_chunks,
|
|
|
|
max_memory: self.max_memory,
|
|
|
|
index_documents_method: IndexDocumentsMethod::ReplaceDocuments,
|
|
|
|
autogenerate_docids: false,
|
2020-11-14 05:35:02 +08:00
|
|
|
};
|
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
// There already has been a document addition, the primary key should be set by now.
|
|
|
|
let primary_key = self.index.primary_key(&self.wtxn)?.context("Index must have a primary key")?;
|
|
|
|
|
|
|
|
// We remap the documents fields based on the new `FieldsIdsMap`.
|
|
|
|
let output = transform.remap_index_documents(
|
|
|
|
primary_key.to_string(),
|
|
|
|
old_fields_ids_map,
|
2021-06-01 22:29:14 +08:00
|
|
|
fields_ids_map.clone(),
|
|
|
|
)?;
|
2021-01-21 00:27:43 +08:00
|
|
|
|
|
|
|
// We clear the full database (words-fst, documents ids and documents content).
|
2020-12-22 23:21:07 +08:00
|
|
|
ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?;
|
2021-01-21 00:27:43 +08:00
|
|
|
|
|
|
|
// We index the generated `TransformOutput` which must contain
|
|
|
|
// all the documents with fields in the newly defined searchable order.
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index, self.update_id);
|
2021-01-21 00:27:43 +08:00
|
|
|
indexing_builder.log_every_n = self.log_every_n;
|
|
|
|
indexing_builder.max_nb_chunks = self.max_nb_chunks;
|
|
|
|
indexing_builder.max_memory = self.max_memory;
|
|
|
|
indexing_builder.linked_hash_map_size = self.linked_hash_map_size;
|
|
|
|
indexing_builder.chunk_compression_type = self.chunk_compression_type;
|
|
|
|
indexing_builder.chunk_compression_level = self.chunk_compression_level;
|
|
|
|
indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
|
|
|
indexing_builder.thread_pool = self.thread_pool;
|
|
|
|
indexing_builder.execute_raw(output, &cb)?;
|
2021-06-01 22:29:14 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
Ok(())
|
|
|
|
}
|
2020-11-12 00:08:18 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
fn update_displayed(&mut self) -> anyhow::Result<bool> {
|
|
|
|
match self.displayed_fields {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Set(ref fields) => {
|
2021-01-21 00:27:43 +08:00
|
|
|
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
|
|
|
// fields are deduplicated, only the first occurrence is taken into account
|
|
|
|
let names: Vec<_> = fields
|
|
|
|
.iter()
|
|
|
|
.unique()
|
|
|
|
.map(String::as_str)
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
for name in names.iter() {
|
|
|
|
fields_ids_map
|
|
|
|
.insert(name)
|
|
|
|
.context("field id limit exceeded")?;
|
2020-11-14 05:35:02 +08:00
|
|
|
}
|
2021-01-21 00:27:43 +08:00
|
|
|
self.index.put_displayed_fields(self.wtxn, &names)?;
|
|
|
|
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
2020-11-14 05:35:02 +08:00
|
|
|
}
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; }
|
|
|
|
Setting::NotSet => return Ok(false),
|
2020-11-14 05:35:02 +08:00
|
|
|
}
|
2021-01-21 00:27:43 +08:00
|
|
|
Ok(true)
|
|
|
|
}
|
2020-11-03 20:20:11 +08:00
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
fn update_distinct_field(&mut self) -> anyhow::Result<bool> {
|
|
|
|
match self.distinct_field {
|
2021-04-07 18:38:48 +08:00
|
|
|
Setting::Set(ref attr) => {
|
|
|
|
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
|
|
|
fields_ids_map
|
|
|
|
.insert(attr)
|
|
|
|
.context("field id limit exceeded")?;
|
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
self.index.put_distinct_field(self.wtxn, &attr)?;
|
2021-04-07 18:38:48 +08:00
|
|
|
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
|
|
|
}
|
2021-06-01 22:29:14 +08:00
|
|
|
Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; },
|
2021-04-07 18:38:48 +08:00
|
|
|
Setting::NotSet => return Ok(false),
|
|
|
|
}
|
|
|
|
Ok(true)
|
|
|
|
}
|
|
|
|
|
2021-03-31 23:14:23 +08:00
|
|
|
/// Updates the index's searchable attributes. This causes the field map to be recomputed to
|
2021-01-21 00:27:43 +08:00
|
|
|
/// reflect the order of the searchable attributes.
|
|
|
|
fn update_searchable(&mut self) -> anyhow::Result<bool> {
|
|
|
|
match self.searchable_fields {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Set(ref fields) => {
|
2021-01-21 00:27:43 +08:00
|
|
|
// every time the searchable attributes are updated, we need to update the
|
|
|
|
// ids for any settings that uses the facets. (displayed_fields,
|
2021-06-01 18:19:55 +08:00
|
|
|
// filterable_fields)
|
2021-01-21 00:27:43 +08:00
|
|
|
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
2020-12-04 19:02:22 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
let mut new_fields_ids_map = FieldsIdsMap::new();
|
|
|
|
// fields are deduplicated, only the first occurrence is taken into account
|
|
|
|
let names = fields
|
|
|
|
.iter()
|
|
|
|
.unique()
|
|
|
|
.map(String::as_str)
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
|
|
|
|
// Add all the searchable attributes to the field map, and then add the
|
|
|
|
// remaining fields from the old field map to the new one
|
|
|
|
for name in names.iter() {
|
|
|
|
new_fields_ids_map
|
|
|
|
.insert(&name)
|
|
|
|
.context("field id limit exceeded")?;
|
|
|
|
}
|
2020-11-03 20:20:11 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
for (_, name) in old_fields_ids_map.iter() {
|
|
|
|
new_fields_ids_map
|
|
|
|
.insert(&name)
|
|
|
|
.context("field id limit exceeded")?;
|
2020-11-14 05:35:02 +08:00
|
|
|
}
|
2020-11-03 20:20:11 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
self.index.put_searchable_fields(self.wtxn, &names)?;
|
|
|
|
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
|
|
|
|
}
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; }
|
|
|
|
Setting::NotSet => return Ok(false),
|
2020-11-03 20:20:11 +08:00
|
|
|
}
|
2021-01-21 00:27:43 +08:00
|
|
|
Ok(true)
|
|
|
|
}
|
2020-11-03 20:20:11 +08:00
|
|
|
|
2021-03-30 01:15:47 +08:00
|
|
|
fn update_stop_words(&mut self) -> anyhow::Result<bool> {
|
|
|
|
match self.stop_words {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Set(ref stop_words) => {
|
2021-03-30 01:15:47 +08:00
|
|
|
let current = self.index.stop_words(self.wtxn)?;
|
|
|
|
// since we can't compare a BTreeSet with an FST we are going to convert the
|
|
|
|
// BTreeSet to an FST and then compare bytes per bytes the two FSTs.
|
2021-04-07 16:53:57 +08:00
|
|
|
let fst = fst::Set::from_iter(stop_words)?;
|
2021-03-30 01:15:47 +08:00
|
|
|
|
|
|
|
// Does the new FST differ from the previous one?
|
|
|
|
if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
|
|
|
|
// we want to re-create our FST.
|
|
|
|
self.index.put_stop_words(self.wtxn, &fst)?;
|
|
|
|
Ok(true)
|
|
|
|
} else {
|
|
|
|
Ok(false)
|
|
|
|
}
|
|
|
|
}
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Reset => Ok(self.index.delete_stop_words(self.wtxn)?),
|
|
|
|
Setting::NotSet => Ok(false),
|
2021-03-30 01:15:47 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-07 16:53:57 +08:00
|
|
|
fn update_synonyms(&mut self) -> anyhow::Result<bool> {
|
|
|
|
match self.synonyms {
|
|
|
|
Setting::Set(ref synonyms) => {
|
2021-04-10 03:56:20 +08:00
|
|
|
fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
|
|
|
|
analyzer
|
|
|
|
.analyze(text)
|
|
|
|
.tokens()
|
|
|
|
.filter_map(|token|
|
|
|
|
if token.is_word() { Some(token.text().to_string()) } else { None }
|
|
|
|
)
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
}
|
2021-04-07 16:53:57 +08:00
|
|
|
|
|
|
|
let mut config = AnalyzerConfig::default();
|
|
|
|
let stop_words = self.index.stop_words(self.wtxn)?;
|
|
|
|
if let Some(stop_words) = &stop_words {
|
|
|
|
config.stop_words(stop_words);
|
|
|
|
}
|
|
|
|
let analyzer = Analyzer::new(config);
|
|
|
|
|
2021-04-10 03:56:20 +08:00
|
|
|
let mut new_synonyms = HashMap::new();
|
|
|
|
for (word, synonyms) in synonyms {
|
|
|
|
// Normalize both the word and associated synonyms.
|
|
|
|
let normalized_word = normalize(&analyzer, word);
|
|
|
|
let normalized_synonyms = synonyms
|
|
|
|
.iter()
|
|
|
|
.map(|synonym| normalize(&analyzer, synonym));
|
|
|
|
|
|
|
|
// Store the normalized synonyms under the normalized word,
|
|
|
|
// merging the possible duplicate words.
|
|
|
|
let entry = new_synonyms
|
|
|
|
.entry(normalized_word)
|
|
|
|
.or_insert_with(Vec::new);
|
|
|
|
entry.extend(normalized_synonyms);
|
|
|
|
}
|
2021-04-07 16:53:57 +08:00
|
|
|
|
2021-04-10 03:56:20 +08:00
|
|
|
// Make sure that we don't have duplicate synonyms.
|
|
|
|
new_synonyms
|
|
|
|
.iter_mut()
|
|
|
|
.for_each(|(_, synonyms)| {
|
|
|
|
synonyms.sort_unstable();
|
|
|
|
synonyms.dedup();
|
|
|
|
});
|
|
|
|
|
|
|
|
let old_synonyms = self.index.synonyms(self.wtxn)?;
|
2021-04-07 16:53:57 +08:00
|
|
|
|
|
|
|
if new_synonyms != old_synonyms {
|
|
|
|
self.index.put_synonyms(self.wtxn, &new_synonyms)?;
|
|
|
|
Ok(true)
|
|
|
|
} else {
|
|
|
|
Ok(false)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?),
|
|
|
|
Setting::NotSet => Ok(false),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
fn update_filterable(&mut self) -> anyhow::Result<()> {
|
2021-06-01 18:19:55 +08:00
|
|
|
match self.filterable_fields {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Set(ref fields) => {
|
2021-01-21 00:27:43 +08:00
|
|
|
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
2021-04-28 23:58:16 +08:00
|
|
|
let mut new_facets = HashSet::new();
|
|
|
|
for name in fields {
|
2021-01-21 00:27:43 +08:00
|
|
|
fields_ids_map.insert(name).context("field id limit exceeded")?;
|
2021-04-28 23:58:16 +08:00
|
|
|
new_facets.insert(name.clone());
|
2021-01-21 00:27:43 +08:00
|
|
|
}
|
2021-06-01 18:19:55 +08:00
|
|
|
self.index.put_filterable_fields(self.wtxn, &new_facets)?;
|
2021-01-21 00:27:43 +08:00
|
|
|
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
2021-06-01 18:19:55 +08:00
|
|
|
Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; }
|
2021-06-01 22:29:14 +08:00
|
|
|
Setting::NotSet => (),
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
2021-06-01 22:29:14 +08:00
|
|
|
Ok(())
|
2021-01-21 00:27:43 +08:00
|
|
|
}
|
2020-11-02 22:31:20 +08:00
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
fn update_criteria(&mut self) -> anyhow::Result<()> {
|
|
|
|
match self.criteria {
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Set(ref fields) => {
|
2021-01-21 00:27:43 +08:00
|
|
|
let mut new_criteria = Vec::new();
|
|
|
|
for name in fields {
|
2021-06-01 21:48:38 +08:00
|
|
|
let criterion = name.parse()?;
|
2021-01-21 00:27:43 +08:00
|
|
|
new_criteria.push(criterion);
|
|
|
|
}
|
|
|
|
self.index.put_criteria(self.wtxn, &new_criteria)?;
|
2020-12-04 19:02:22 +08:00
|
|
|
}
|
2021-04-07 19:33:44 +08:00
|
|
|
Setting::Reset => { self.index.delete_criteria(self.wtxn)?; }
|
|
|
|
Setting::NotSet => (),
|
2020-12-04 19:02:22 +08:00
|
|
|
}
|
2020-11-02 22:31:20 +08:00
|
|
|
Ok(())
|
|
|
|
}
|
2021-01-21 00:27:43 +08:00
|
|
|
|
|
|
|
pub fn execute<F>(mut self, progress_callback: F) -> anyhow::Result<()>
|
2021-04-07 19:33:44 +08:00
|
|
|
where
|
|
|
|
F: Fn(UpdateIndexingStep, u64) + Sync
|
2021-03-30 01:15:47 +08:00
|
|
|
{
|
|
|
|
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
2021-06-01 22:29:14 +08:00
|
|
|
|
|
|
|
let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
|
2021-03-30 01:15:47 +08:00
|
|
|
let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
|
2021-06-01 22:29:14 +08:00
|
|
|
|
2021-03-30 01:15:47 +08:00
|
|
|
self.update_displayed()?;
|
2021-06-01 22:29:14 +08:00
|
|
|
self.update_filterable()?;
|
|
|
|
self.update_distinct_field()?;
|
2021-03-30 01:15:47 +08:00
|
|
|
self.update_criteria()?;
|
2021-06-01 22:29:14 +08:00
|
|
|
|
|
|
|
// If there is new faceted fields we indicate that we must reindex as we must
|
|
|
|
// index new fields as facets. It means that the distinct attribute,
|
|
|
|
// an Asc/Desc criterion or a filtered attribute as be added or removed.
|
|
|
|
let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
|
|
|
|
let faceted_updated = old_faceted_fields != new_faceted_fields;
|
|
|
|
|
|
|
|
let stop_words_updated = self.update_stop_words()?;
|
2021-04-07 16:53:57 +08:00
|
|
|
let synonyms_updated = self.update_synonyms()?;
|
2021-03-30 01:15:47 +08:00
|
|
|
let searchable_updated = self.update_searchable()?;
|
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated {
|
2021-03-30 01:15:47 +08:00
|
|
|
self.reindex(&progress_callback, old_fields_ids_map)?;
|
2021-01-21 00:27:43 +08:00
|
|
|
}
|
2021-06-01 22:29:14 +08:00
|
|
|
|
2021-03-30 01:15:47 +08:00
|
|
|
Ok(())
|
|
|
|
}
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
mod tests {
|
|
|
|
use heed::EnvOpenOptions;
|
2021-05-03 21:58:47 +08:00
|
|
|
use heed::types::ByteSlice;
|
|
|
|
use maplit::{btreeset, hashmap, hashset};
|
|
|
|
use big_s::S;
|
2020-11-02 22:31:20 +08:00
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
use crate::{Criterion, FilterCondition, SearchResult};
|
2021-01-21 00:27:43 +08:00
|
|
|
use crate::update::{IndexDocuments, UpdateFormat};
|
|
|
|
|
2021-04-07 19:33:44 +08:00
|
|
|
use super::*;
|
|
|
|
|
2020-11-04 01:22:40 +08:00
|
|
|
#[test]
|
|
|
|
fn set_and_reset_searchable_fields() {
|
|
|
|
let path = tempfile::tempdir().unwrap();
|
|
|
|
let mut options = EnvOpenOptions::new();
|
|
|
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
|
|
|
let index = Index::new(options, &path).unwrap();
|
|
|
|
|
|
|
|
// First we send 3 documents with ids from 1 to 3.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2021-03-06 19:48:41 +08:00
|
|
|
let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..];
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
2020-11-04 01:22:40 +08:00
|
|
|
builder.update_format(UpdateFormat::Csv);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(content, |_, _| ()).unwrap();
|
2020-11-04 01:22:40 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// We change the searchable fields to be the "name" field only.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, 1);
|
2020-11-04 01:22:40 +08:00
|
|
|
builder.set_searchable_fields(vec!["name".into()]);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
2020-11-04 01:22:40 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the searchable field is correctly set to "name" only.
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
// When we search for something that is not in
|
|
|
|
// the searchable fields it must not return any document.
|
|
|
|
let result = index.search(&rtxn).query("23").execute().unwrap();
|
|
|
|
assert!(result.documents_ids.is_empty());
|
|
|
|
|
|
|
|
// When we search for something that is in the searchable fields
|
|
|
|
// we must find the appropriate document.
|
|
|
|
let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
|
|
|
|
let documents = index.documents(&rtxn, result.documents_ids).unwrap();
|
|
|
|
assert_eq!(documents.len(), 1);
|
|
|
|
assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
|
|
|
|
drop(rtxn);
|
|
|
|
|
|
|
|
// We change the searchable fields to be the "name" field only.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, 2);
|
2020-11-04 01:22:40 +08:00
|
|
|
builder.reset_searchable_fields();
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
2020-11-04 01:22:40 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the searchable field have been reset and documents are found now.
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let searchable_fields = index.searchable_fields(&rtxn).unwrap();
|
|
|
|
assert_eq!(searchable_fields, None);
|
|
|
|
let result = index.search(&rtxn).query("23").execute().unwrap();
|
|
|
|
assert_eq!(result.documents_ids.len(), 1);
|
|
|
|
let documents = index.documents(&rtxn, result.documents_ids).unwrap();
|
|
|
|
assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn mixup_searchable_with_displayed_fields() {
|
|
|
|
let path = tempfile::tempdir().unwrap();
|
|
|
|
let mut options = EnvOpenOptions::new();
|
|
|
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
|
|
|
let index = Index::new(options, &path).unwrap();
|
|
|
|
|
|
|
|
// First we send 3 documents with ids from 1 to 3.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
|
|
|
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
2021-05-01 02:34:29 +08:00
|
|
|
builder.enable_autogenerate_docids();
|
2020-11-04 01:22:40 +08:00
|
|
|
builder.update_format(UpdateFormat::Csv);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(content, |_, _| ()).unwrap();
|
2020-11-04 01:22:40 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// In the same transaction we change the displayed fields to be only the "age".
|
|
|
|
// We also change the searchable fields to be the "name" field only.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, 1);
|
2020-11-04 01:22:40 +08:00
|
|
|
builder.set_displayed_fields(vec!["age".into()]);
|
|
|
|
builder.set_searchable_fields(vec!["name".into()]);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
2020-11-04 01:22:40 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the displayed fields are correctly set to `None` (default value).
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let fields_ids = index.displayed_fields(&rtxn).unwrap();
|
2021-01-21 00:27:43 +08:00
|
|
|
assert_eq!(fields_ids.unwrap(), (&["age"][..]));
|
2020-11-04 01:22:40 +08:00
|
|
|
drop(rtxn);
|
|
|
|
|
|
|
|
// We change the searchable fields to be the "name" field only.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, 2);
|
2020-11-04 01:22:40 +08:00
|
|
|
builder.reset_searchable_fields();
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
2020-11-04 01:22:40 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the displayed fields always contains only the "age" field.
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let fields_ids = index.displayed_fields(&rtxn).unwrap();
|
2021-01-21 00:27:43 +08:00
|
|
|
assert_eq!(fields_ids.unwrap(), &["age"][..]);
|
2020-11-04 01:22:40 +08:00
|
|
|
}
|
|
|
|
|
2020-11-02 22:31:20 +08:00
|
|
|
#[test]
|
|
|
|
fn default_displayed_fields() {
|
|
|
|
let path = tempfile::tempdir().unwrap();
|
|
|
|
let mut options = EnvOpenOptions::new();
|
|
|
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
|
|
|
let index = Index::new(options, &path).unwrap();
|
|
|
|
|
|
|
|
// First we send 3 documents with ids from 1 to 3.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
|
|
|
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
2021-05-01 02:34:29 +08:00
|
|
|
builder.enable_autogenerate_docids();
|
2020-11-02 22:31:20 +08:00
|
|
|
builder.update_format(UpdateFormat::Csv);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(content, |_, _| ()).unwrap();
|
2020-11-02 22:31:20 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the displayed fields are correctly set to `None` (default value).
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let fields_ids = index.displayed_fields(&rtxn).unwrap();
|
|
|
|
assert_eq!(fields_ids, None);
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn set_and_reset_displayed_field() {
|
|
|
|
let path = tempfile::tempdir().unwrap();
|
|
|
|
let mut options = EnvOpenOptions::new();
|
|
|
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
|
|
|
let index = Index::new(options, &path).unwrap();
|
|
|
|
|
|
|
|
// First we send 3 documents with ids from 1 to 3.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
|
|
|
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
2021-05-01 02:34:29 +08:00
|
|
|
builder.enable_autogenerate_docids();
|
2020-11-02 22:31:20 +08:00
|
|
|
builder.update_format(UpdateFormat::Csv);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(content, |_, _| ()).unwrap();
|
2020-11-02 22:31:20 +08:00
|
|
|
|
|
|
|
// In the same transaction we change the displayed fields to be only the age.
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
2020-11-02 22:31:20 +08:00
|
|
|
builder.set_displayed_fields(vec!["age".into()]);
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
2020-11-02 22:31:20 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the displayed fields are correctly set to only the "age" field.
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let fields_ids = index.displayed_fields(&rtxn).unwrap();
|
2021-01-21 00:27:43 +08:00
|
|
|
assert_eq!(fields_ids.unwrap(), &["age"][..]);
|
2020-11-02 22:31:20 +08:00
|
|
|
drop(rtxn);
|
|
|
|
|
|
|
|
// We reset the fields ids to become `None`, the default value.
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2020-12-22 23:21:07 +08:00
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, 0);
|
2020-11-02 22:31:20 +08:00
|
|
|
builder.reset_displayed_fields();
|
2020-12-22 23:21:07 +08:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
2020-11-02 22:31:20 +08:00
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
// Check that the displayed fields are correctly set to `None` (default value).
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
let fields_ids = index.displayed_fields(&rtxn).unwrap();
|
|
|
|
assert_eq!(fields_ids, None);
|
|
|
|
}
|
2020-11-13 21:49:48 +08:00
|
|
|
|
|
|
|
    #[test]
    fn set_filterable_fields() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // Set the filterable fields to be the age.
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        builder.set_filterable_fields(hashset!{ S("age") });
        builder.execute(|_, _| ()).unwrap();

        // Then index some documents (same write transaction as the settings update).
        let content = &br#"[
            { "name": "kevin", "age": 23 },
            { "name": "kevina", "age": 21 },
            { "name": "benoit", "age": 34 }
        ]"#[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
        builder.update_format(UpdateFormat::Json);
        builder.enable_autogenerate_docids();
        builder.execute(content, |_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Check that the filterable fields are correctly set.
        let rtxn = index.read_txn().unwrap();
        let fields_ids = index.filterable_fields(&rtxn).unwrap();
        assert_eq!(fields_ids, hashset!{ S("age") });
        // Only count the field_id 0 and level 0 facet values
        // (the prefix [0, 0] selects field id 0 at facet level 0).
        // TODO we must support typed CSVs for numbers to be understood.
        let count = index.facet_id_f64_docids
            .remap_key_type::<ByteSlice>()
            .prefix_iter(&rtxn, &[0, 0]).unwrap().count();
        // Three distinct age values were indexed: 23, 21 and 34.
        assert_eq!(count, 3);
        drop(rtxn);

        // Index a little more documents with new and current facets values.
        let mut wtxn = index.write_txn().unwrap();
        let content = &br#"[
            { "name": "kevin2", "age": 23 },
            { "name": "kevina2", "age": 21 },
            { "name": "benoit", "age": 35 }
        ]"#[..];

        let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
        builder.enable_autogenerate_docids();
        builder.update_format(UpdateFormat::Json);
        builder.execute(content, |_, _| ()).unwrap();
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();
        // Only count the field_id 0 and level 0 facet values.
        // TODO we must support typed CSVs for numbers to be understood.
        let count = index.facet_id_f64_docids
            .remap_key_type::<ByteSlice>()
            .prefix_iter(&rtxn, &[0, 0]).unwrap().count();
        // Only age 35 is new; 23 and 21 already existed, so 4 distinct values now.
        assert_eq!(count, 4);
    }
|
|
|
|
|
2021-06-01 22:29:14 +08:00
|
|
|
    #[test]
    fn set_asc_desc_field() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // Set the ranking rule to sort by ascending age.
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        // Don't display the generated `id` field.
        builder.set_displayed_fields(vec![S("name"), S("age")]);
        builder.set_criteria(vec![S("asc(age)")]);
        builder.execute(|_, _| ()).unwrap();

        // Then index some documents.
        let content = &br#"[
            { "name": "kevin", "age": 23 },
            { "name": "kevina", "age": 21 },
            { "name": "benoit", "age": 34 }
        ]"#[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
        builder.update_format(UpdateFormat::Json);
        builder.enable_autogenerate_docids();
        builder.execute(content, |_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Run an empty query just to ensure that the search results are ordered.
        let rtxn = index.read_txn().unwrap();
        let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
        let documents = index.documents(&rtxn, documents_ids).unwrap();

        // Fetch the documents "age" field in the order in which the documents appear.
        let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap();
        let iter = documents.into_iter().map(|(_, doc)| {
            let bytes = doc.get(age_field_id).unwrap();
            let string = std::str::from_utf8(bytes).unwrap();
            string.parse::<u32>().unwrap()
        });

        // The asc(age) criterion must return the ages in ascending order.
        assert_eq!(iter.collect::<Vec<_>>(), vec![21, 23, 34]);
    }
|
|
|
|
|
|
|
|
    #[test]
    fn set_distinct_field() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // Set the distinct field to be the age.
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        // Don't display the generated `id` field.
        builder.set_displayed_fields(vec![S("name"), S("age")]);
        builder.set_distinct_field(S("age"));
        builder.execute(|_, _| ()).unwrap();

        // Then index some documents; five of them share age 34.
        let content = &br#"[
            { "name": "kevin", "age": 23 },
            { "name": "kevina", "age": 21 },
            { "name": "benoit", "age": 34 },
            { "name": "bernard", "age": 34 },
            { "name": "bertrand", "age": 34 },
            { "name": "bernie", "age": 34 },
            { "name": "ben", "age": 34 }
        ]"#[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
        builder.update_format(UpdateFormat::Json);
        builder.enable_autogenerate_docids();
        builder.execute(content, |_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Run an empty (placeholder) search to exercise the distinct criterion.
        let rtxn = index.read_txn().unwrap();
        let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();

        // Distinct on "age" keeps only one document per age value,
        // so the five age-34 documents collapse into one: 3 results (ages 23, 21, 34).
        assert_eq!(documents_ids.len(), 3);
    }
|
|
|
|
|
2021-03-30 01:15:47 +08:00
|
|
|
    #[test]
    fn default_stop_words() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // First we send 3 documents with ids from 1 to 3.
        let mut wtxn = index.write_txn().unwrap();
        let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
        builder.enable_autogenerate_docids();
        builder.update_format(UpdateFormat::Csv);
        builder.execute(content, |_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Ensure there is no stop_words by default
        let rtxn = index.read_txn().unwrap();
        let stop_words = index.stop_words(&rtxn).unwrap();
        assert!(stop_words.is_none());
    }
|
|
|
|
|
|
|
|
    #[test]
    fn set_and_reset_stop_words() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // First we send 3 documents with ids from 1 to 3.
        let mut wtxn = index.write_txn().unwrap();
        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
        builder.enable_autogenerate_docids();
        builder.update_format(UpdateFormat::Csv);
        builder.execute(content, |_, _| ()).unwrap();

        // In the same transaction we provide some stop_words
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        let set = btreeset! { "i".to_string(), "the".to_string(), "are".to_string() };
        builder.set_stop_words(set.clone());
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Ensure stop_words are effectively stored
        let rtxn = index.read_txn().unwrap();
        let stop_words = index.stop_words(&rtxn).unwrap();
        assert!(stop_words.is_some()); // at this point the index should return something

        // The stored fst must contain exactly the words we provided.
        let stop_words = stop_words.unwrap();
        let expected = fst::Set::from_iter(&set).unwrap();
        assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());

        // when we search for something that is a non prefix stop_words it should be ignored
        // thus we should get a placeholder search (all the results = 3)
        // (the trailing space forces the word to be treated as complete, not a prefix)
        let result = index.search(&rtxn).query("the ").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 3);
        let result = index.search(&rtxn).query("i ").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 3);
        let result = index.search(&rtxn).query("are ").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 3);

        // Non-stop-words still match normally.
        let result = index.search(&rtxn).query("dog").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
        let result = index.search(&rtxn).query("benoît").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data

        // now we'll reset the stop_words and ensure it's None
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        builder.reset_stop_words();
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();
        let stop_words = index.stop_words(&rtxn).unwrap();
        assert!(stop_words.is_none());

        // now we can search for the stop words
        let result = index.search(&rtxn).query("the").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 2);
        let result = index.search(&rtxn).query("i").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 1);
        let result = index.search(&rtxn).query("are").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 2);

        // the rest of the search is still not impacted
        let result = index.search(&rtxn).query("dog").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
        let result = index.search(&rtxn).query("benoît").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
    }
|
2021-01-21 00:27:43 +08:00
|
|
|
|
2021-04-10 03:56:20 +08:00
|
|
|
    #[test]
    fn set_and_reset_synonyms() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // Send 3 documents with ids from 1 to 3.
        let mut wtxn = index.write_txn().unwrap();
        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
        builder.enable_autogenerate_docids();
        builder.update_format(UpdateFormat::Csv);
        builder.execute(content, |_, _| ()).unwrap();

        // In the same transaction provide some synonyms
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        builder.set_synonyms(hashmap! {
            "blini".to_string() => vec!["crepes".to_string()],
            "super like".to_string() => vec!["love".to_string()],
            "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
        });
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Ensure synonyms are effectively stored
        let rtxn = index.read_txn().unwrap();
        let synonyms = index.synonyms(&rtxn).unwrap();
        assert!(!synonyms.is_empty()); // at this point the index should return something

        // Check that we can use synonyms
        let result = index.search(&rtxn).query("blini").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 1);
        let result = index.search(&rtxn).query("super like").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 1);
        let result = index.search(&rtxn).query("puppies").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 2);

        // Reset the synonyms
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        builder.reset_synonyms();
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Ensure synonyms are reset
        let rtxn = index.read_txn().unwrap();
        let synonyms = index.synonyms(&rtxn).unwrap();
        assert!(synonyms.is_empty());

        // Check that the synonyms no longer work
        let result = index.search(&rtxn).query("blini").execute().unwrap();
        assert!(result.documents_ids.is_empty());
        let result = index.search(&rtxn).query("super like").execute().unwrap();
        assert!(result.documents_ids.is_empty());
        let result = index.search(&rtxn).query("puppies").execute().unwrap();
        assert!(result.documents_ids.is_empty());
    }
|
|
|
|
|
2021-01-21 00:27:43 +08:00
|
|
|
    #[test]
    fn setting_searchable_recomputes_other_settings() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // Set all the settings except searchable
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        builder.set_displayed_fields(vec!["hello".to_string()]);
        builder.set_filterable_fields(hashset!{ S("age"), S("toto") });
        builder.set_criteria(vec!["asc(toto)".to_string()]);
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // check the output
        let rtxn = index.read_txn().unwrap();
        assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
        // since no documents have been pushed the primary key is still unset
        assert!(index.primary_key(&rtxn).unwrap().is_none());
        assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
        drop(rtxn);

        // We set toto and age as searchable to force reordering of the fields
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 1);
        builder.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]);
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // The other settings must survive the searchable-fields update unchanged.
        let rtxn = index.read_txn().unwrap();
        assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
        assert!(index.primary_key(&rtxn).unwrap().is_none());
        assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
    }
|
2021-06-01 21:48:38 +08:00
|
|
|
|
|
|
|
    #[test]
    fn setting_not_filterable_cant_filter() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        // Set all the settings except searchable
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, 0);
        builder.set_displayed_fields(vec!["hello".to_string()]);
        // It is only Asc(toto), there is a facet database but it is denied to filter with toto.
        builder.set_criteria(vec!["asc(toto)".to_string()]);
        builder.execute(|_, _| ()).unwrap();
        wtxn.commit().unwrap();

        // Filtering on a field that is not declared filterable must error out.
        let rtxn = index.read_txn().unwrap();
        FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err();
    }
|
2020-11-02 22:31:20 +08:00
|
|
|
}
|