From 4871509507e48e2514f5d0b1d89bfefe7f7f6d19 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 4 Oct 2022 11:06:48 +0200
Subject: [PATCH] Make sure that meilisearch-http works without index wrapper

---
 index-scheduler/src/index_mapper.rs           |   8 +-
 index-scheduler/src/lib.rs                    |   2 +-
 index/src/lib.rs                              |   1 +
 index/src/search.rs                           | 188 +++++++++++++++++-
 .../src/routes/indexes/documents.rs           |   5 +-
 meilisearch-http/src/routes/indexes/mod.rs    |  31 ++-
 meilisearch-http/src/routes/indexes/search.rs |   6 +-
 .../src/routes/indexes/settings.rs            |   6 +-
 meilisearch-http/src/routes/mod.rs            |  26 ++-
 9 files changed, 230 insertions(+), 43 deletions(-)

diff --git a/index-scheduler/src/index_mapper.rs b/index-scheduler/src/index_mapper.rs
index f39af072b..1f786c5f8 100644
--- a/index-scheduler/src/index_mapper.rs
+++ b/index-scheduler/src/index_mapper.rs
@@ -104,12 +104,14 @@ impl IndexMapper {
         Ok(index)
     }
 
-    pub fn indexes(&self, rtxn: &RoTxn) -> Result<Vec<Index>> {
+    pub fn indexes(&self, rtxn: &RoTxn) -> Result<Vec<(String, Index)>> {
         self.index_mapping
             .iter(rtxn)?
             .map(|ret| {
-                ret.map_err(Error::from)
-                    .and_then(|(name, _)| self.index(rtxn, name))
+                ret.map_err(Error::from).and_then(|(name, _)| {
+                    self.index(rtxn, name)
+                        .map(|index| (name.to_string(), index))
+                })
             })
             .collect()
     }
diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs
index 921980ac7..080b39eb9 100644
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -231,7 +231,7 @@ impl IndexScheduler {
     }
 
     /// Return and open all the indexes.
-    pub fn indexes(&self) -> Result<Vec<Index>> {
+    pub fn indexes(&self) -> Result<Vec<(String, Index)>> {
         let rtxn = self.env.read_txn()?;
         self.index_mapper.indexes(&rtxn)
     }
diff --git a/index/src/lib.rs b/index/src/lib.rs
index 401e77286..ce34626db 100644
--- a/index/src/lib.rs
+++ b/index/src/lib.rs
@@ -1,4 +1,5 @@
 pub use search::{
+    all_documents, perform_search, retrieve_document, retrieve_documents, settings,
     MatchingStrategy, SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
     DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
 };
diff --git a/index/src/search.rs b/index/src/search.rs
index fdd785c73..4cd5647f3 100644
--- a/index/src/search.rs
+++ b/index/src/search.rs
@@ -1,19 +1,25 @@
 use std::cmp::min;
 use std::collections::{BTreeMap, BTreeSet, HashSet};
+use std::marker::PhantomData;
 use std::str::FromStr;
 use std::time::Instant;
 
 use either::Either;
+use fst::IntoStreamer;
+use milli::heed::RoTxn;
 use milli::tokenizer::TokenizerBuilder;
+use milli::update::Setting;
 use milli::{
-    AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder,
-    SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    obkv_to_json, AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds,
+    MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Value};
 
 use crate::error::FacetError;
+use crate::updates::{FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, TypoSettings};
+use crate::{Checked, Settings};
 
 use super::error::{IndexError, Result};
@@ -282,6 +288,184 @@ pub fn perform_search(index: &Index, query: SearchQuery) -> Result<SearchResult> {
     Ok(result)
 }
 
+pub fn all_documents<'a>(
+    index: &Index,
+    rtxn: &'a RoTxn,
+) -> Result<impl Iterator<Item = Result<Document>> + 'a> {
+    let fields_ids_map = index.fields_ids_map(rtxn)?;
+    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+
+    Ok(index.all_documents(rtxn)?.map(move |ret| {
+        ret.map_err(IndexError::from)
+            .and_then(|(_key, document)| -> Result<_> {
+                Ok(obkv_to_json(&all_fields, &fields_ids_map, document)?)
+            })
+    }))
+}
+
+pub fn retrieve_documents<S: AsRef<str>>(
+    index: &Index,
+    offset: usize,
+    limit: usize,
+    attributes_to_retrieve: Option<Vec<S>>,
+) -> Result<(u64, Vec<Document>)> {
+    let rtxn = index.read_txn()?;
+
+    let mut documents = Vec::new();
+    for document in all_documents(index, &rtxn)?.skip(offset).take(limit) {
+        let document = match &attributes_to_retrieve {
+            Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
+                &document?,
+                attributes_to_retrieve.iter().map(|s| s.as_ref()),
+            ),
+            None => document?,
+        };
+        documents.push(document);
+    }
+
+    let number_of_documents = index.number_of_documents(&rtxn)?;
+    Ok((number_of_documents, documents))
+}
+
+pub fn retrieve_document<S: AsRef<str>>(
+    index: &Index,
+    doc_id: &str,
+    attributes_to_retrieve: Option<Vec<S>>,
+) -> Result<Document> {
+    let txn = index.read_txn()?;
+
+    let fields_ids_map = index.fields_ids_map(&txn)?;
+    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+
+    let internal_id = index
+        .external_documents_ids(&txn)?
+        .get(doc_id.as_bytes())
+        .ok_or_else(|| IndexError::DocumentNotFound(doc_id.to_string()))?;
+
+    let document = index
+        .documents(&txn, std::iter::once(internal_id))?
+        .into_iter()
+        .next()
+        .map(|(_, d)| d)
+        .ok_or_else(|| IndexError::DocumentNotFound(doc_id.to_string()))?;
+
+    let document = obkv_to_json(&all_fields, &fields_ids_map, document)?;
+    let document = match &attributes_to_retrieve {
+        Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
+            &document,
+            attributes_to_retrieve.iter().map(|s| s.as_ref()),
+        ),
+        None => document,
+    };
+
+    Ok(document)
+}
+
+pub fn settings(index: &Index, rtxn: &RoTxn) -> Result<Settings<Checked>> {
+    let displayed_attributes = index
+        .displayed_fields(rtxn)?
+        .map(|fields| fields.into_iter().map(String::from).collect());
+
+    let searchable_attributes = index
+        .user_defined_searchable_fields(rtxn)?
+        .map(|fields| fields.into_iter().map(String::from).collect());
+
+    let filterable_attributes = index.filterable_fields(rtxn)?.into_iter().collect();
+
+    let sortable_attributes = index.sortable_fields(rtxn)?.into_iter().collect();
+
+    let criteria = index
+        .criteria(rtxn)?
+        .into_iter()
+        .map(|c| c.to_string())
+        .collect();
+
+    let stop_words = index
+        .stop_words(rtxn)?
+        .map(|stop_words| -> Result<BTreeSet<String>> {
+            Ok(stop_words.stream().into_strs()?.into_iter().collect())
+        })
+        .transpose()?
+        .unwrap_or_default();
+    let distinct_field = index.distinct_field(rtxn)?.map(String::from);
+
+    // In milli, each word in the synonyms map was split on its separators. Since we
+    // lost that information, we put a space between the words instead.
+    let synonyms = index
+        .synonyms(rtxn)?
+        .iter()
+        .map(|(key, values)| {
+            (
+                key.join(" "),
+                values.iter().map(|value| value.join(" ")).collect(),
+            )
+        })
+        .collect();
+
+    let min_typo_word_len = MinWordSizeTyposSetting {
+        one_typo: Setting::Set(index.min_word_len_one_typo(rtxn)?),
+        two_typos: Setting::Set(index.min_word_len_two_typos(rtxn)?),
+    };
+
+    let disabled_words = match index.exact_words(rtxn)? {
+        Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
+        None => BTreeSet::new(),
+    };
+
+    let disabled_attributes = index
+        .exact_attributes(rtxn)?
+        .into_iter()
+        .map(String::from)
+        .collect();
+
+    let typo_tolerance = TypoSettings {
+        enabled: Setting::Set(index.authorize_typos(rtxn)?),
+        min_word_size_for_typos: Setting::Set(min_typo_word_len),
+        disable_on_words: Setting::Set(disabled_words),
+        disable_on_attributes: Setting::Set(disabled_attributes),
+    };
+
+    let faceting = FacetingSettings {
+        max_values_per_facet: Setting::Set(
+            index
+                .max_values_per_facet(rtxn)?
+                .unwrap_or(DEFAULT_VALUES_PER_FACET),
+        ),
+    };
+
+    let pagination = PaginationSettings {
+        max_total_hits: Setting::Set(
+            index
+                .pagination_max_total_hits(rtxn)?
+                .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
+        ),
+    };
+
+    Ok(Settings {
+        displayed_attributes: match displayed_attributes {
+            Some(attrs) => Setting::Set(attrs),
+            None => Setting::Reset,
+        },
+        searchable_attributes: match searchable_attributes {
+            Some(attrs) => Setting::Set(attrs),
+            None => Setting::Reset,
+        },
+        filterable_attributes: Setting::Set(filterable_attributes),
+        sortable_attributes: Setting::Set(sortable_attributes),
+        ranking_rules: Setting::Set(criteria),
+        stop_words: Setting::Set(stop_words),
+        distinct_attribute: match distinct_field {
+            Some(field) => Setting::Set(field),
+            None => Setting::Reset,
+        },
+        synonyms: Setting::Set(synonyms),
+        typo_tolerance: Setting::Set(typo_tolerance),
+        faceting: Setting::Set(faceting),
+        pagination: Setting::Set(pagination),
+        _kind: PhantomData,
+    })
+}
+
 fn insert_geo_distance(sorts: &[String], document: &mut Document) {
     lazy_static::lazy_static! {
         static ref GEO_REGEX: Regex =
diff --git a/meilisearch-http/src/routes/indexes/documents.rs b/meilisearch-http/src/routes/indexes/documents.rs
index 618787350..5f461693b 100644
--- a/meilisearch-http/src/routes/indexes/documents.rs
+++ b/meilisearch-http/src/routes/indexes/documents.rs
@@ -8,6 +8,7 @@ use actix_web::{web, HttpRequest, HttpResponse};
 use bstr::ByteSlice;
 use document_formats::{read_csv, read_json, read_ndjson, PayloadType};
 use futures::{Stream, StreamExt};
+use index::{retrieve_document, retrieve_documents};
 use index_scheduler::milli::update::IndexDocumentsMethod;
 use index_scheduler::IndexScheduler;
 use index_scheduler::{KindWithContent, TaskView};
@@ -103,7 +104,7 @@ pub async fn get_document(
     let attributes_to_retrieve = fields.and_then(fold_star_or);
 
     let index = index_scheduler.index(&path.index_uid)?;
-    let document = index.retrieve_document(&path.document_id, attributes_to_retrieve)?;
+    let document = retrieve_document(&index, &path.document_id, attributes_to_retrieve)?;
     debug!("returns: {:?}", document);
     Ok(HttpResponse::Ok().json(document))
 }
@@ -149,7 +150,7 @@ pub async fn get_all_documents(
     let attributes_to_retrieve = fields.and_then(fold_star_or);
 
     let index = index_scheduler.index(&index_uid)?;
-    let (total, documents) = index.retrieve_documents(offset, limit, attributes_to_retrieve)?;
+    let (total, documents) = retrieve_documents(&index, offset, limit, attributes_to_retrieve)?;
 
     let ret = PaginationView::new(offset, limit, total as usize, documents);
diff --git a/meilisearch-http/src/routes/indexes/mod.rs b/meilisearch-http/src/routes/indexes/mod.rs
index f27190b18..c120d1e00 100644
--- a/meilisearch-http/src/routes/indexes/mod.rs
+++ b/meilisearch-http/src/routes/indexes/mod.rs
@@ -1,6 +1,6 @@
 use actix_web::web::Data;
 use actix_web::{web, HttpRequest, HttpResponse};
-use index_scheduler::milli::FieldDistribution;
+use index_scheduler::milli::{FieldDistribution, Index};
 use index_scheduler::{IndexScheduler, KindWithContent, Query, Status};
 use log::debug;
 use meilisearch_types::error::ResponseError;
@@ -11,7 +11,6 @@ use time::OffsetDateTime;
 
 use crate::analytics::Analytics;
 use crate::extractors::authentication::{policies::*, AuthenticationError, GuardedData};
 use crate::extractors::sequential_extractor::SeqHandler;
-use index_scheduler::task::TaskView;
 
 use super::Pagination;
@@ -51,15 +50,14 @@ pub struct IndexView {
     pub primary_key: Option<String>,
 }
 
-impl TryFrom<&Index> for IndexView {
-    type Error = index::error::IndexError;
-
-    fn try_from(index: &Index) -> Result<Self, Self::Error> {
+impl IndexView {
+    fn new(uid: String, index: &Index) -> Result<IndexView, milli::Error> {
+        let rtxn = index.read_txn()?;
         Ok(IndexView {
-            uid: index.name.clone(),
-            created_at: index.created_at()?,
-            updated_at: index.updated_at()?,
-            primary_key: index.primary_key()?,
+            uid,
+            created_at: index.created_at(&rtxn)?,
+            updated_at: index.updated_at(&rtxn)?,
+            primary_key: index.primary_key(&rtxn)?.map(String::from),
         })
     }
 }
@@ -71,9 +69,9 @@ pub async fn list_indexes(
     let search_rules = &index_scheduler.filters().search_rules;
     let indexes: Vec<_> = index_scheduler.indexes()?;
     let indexes = indexes
-        .iter()
-        .filter(|index| search_rules.is_index_authorized(&index.name))
-        .map(IndexView::try_from)
+        .into_iter()
+        .filter(|(name, _)| search_rules.is_index_authorized(name))
+        .map(|(name, index)| IndexView::new(name, &index))
         .collect::<Result<Vec<_>, _>>()?;
 
     let ret = paginate.auto_paginate_sized(indexes.into_iter());
@@ -130,7 +128,7 @@ pub async fn get_index(
     index_uid: web::Path<String>,
 ) -> Result<HttpResponse, ResponseError> {
     let index = index_scheduler.index(&index_uid)?;
-    let index_view: IndexView = (&index).try_into()?;
+    let index_view = IndexView::new(index_uid.into_inner(), &index)?;
 
     debug!("returns: {:?}", index_view);
 
@@ -216,10 +214,11 @@ impl IndexStats {
         let is_processing = !processing_task.is_empty();
 
         let index = index_scheduler.index(&index_uid)?;
+        let rtxn = index.read_txn()?;
         Ok(IndexStats {
-            number_of_documents: index.number_of_documents()?,
+            number_of_documents: index.number_of_documents(&rtxn)?,
             is_indexing: is_processing,
-            field_distribution: index.field_distribution()?,
+            field_distribution: index.field_distribution(&rtxn)?,
         })
     }
 }
diff --git a/meilisearch-http/src/routes/indexes/search.rs b/meilisearch-http/src/routes/indexes/search.rs
index 8f91eaa6b..f19ebdaee 100644
--- a/meilisearch-http/src/routes/indexes/search.rs
+++ b/meilisearch-http/src/routes/indexes/search.rs
@@ -1,7 +1,7 @@
 use actix_web::web::Data;
 use actix_web::{web, HttpRequest, HttpResponse};
 use index::{
-    MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
+    perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
     DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
 };
 use index_scheduler::IndexScheduler;
@@ -151,7 +151,7 @@ pub async fn search_with_url_query(
     let mut aggregate = SearchAggregator::from_query(&query, &req);
 
     let index = index_scheduler.index(&index_uid)?;
-    let search_result = index.perform_search(query);
+    let search_result = perform_search(&index, query);
     if let Ok(ref search_result) = search_result {
         aggregate.succeed(search_result);
     }
@@ -185,7 +185,7 @@ pub async fn search_with_post(
     let mut aggregate = SearchAggregator::from_query(&query, &req);
 
     let index = index_scheduler.index(&index_uid)?;
-    let search_result = index.perform_search(query);
+    let search_result = perform_search(&index, query);
     if let Ok(ref search_result) = search_result {
         aggregate.succeed(search_result);
     }
diff --git a/meilisearch-http/src/routes/indexes/settings.rs b/meilisearch-http/src/routes/indexes/settings.rs
index cd30cc950..b11a863bc 100644
--- a/meilisearch-http/src/routes/indexes/settings.rs
+++ b/meilisearch-http/src/routes/indexes/settings.rs
@@ -97,7 +97,8 @@ macro_rules! make_setting_route {
             index_uid: actix_web::web::Path<String>,
         ) -> std::result::Result<HttpResponse, ResponseError> {
             let index = index_scheduler.index(&index_uid)?;
-            let settings = index.settings()?;
+            let rtxn = index.read_txn()?;
+            let settings = index::settings(&index, &rtxn)?;
 
             debug!("returns: {:?}", settings);
             let mut json = serde_json::json!(&settings);
@@ -454,7 +455,8 @@ pub async fn get_all(
     index_uid: web::Path<String>,
 ) -> Result<HttpResponse, ResponseError> {
     let index = index_scheduler.index(&index_uid)?;
-    let new_settings = index.settings()?;
+    let rtxn = index.read_txn()?;
+    let new_settings = index::settings(&index, &rtxn)?;
     debug!("returns: {:?}", new_settings);
     Ok(HttpResponse::Ok().json(new_settings))
 }
diff --git a/meilisearch-http/src/routes/mod.rs b/meilisearch-http/src/routes/mod.rs
index 7cb893095..286225d7a 100644
--- a/meilisearch-http/src/routes/mod.rs
+++ b/meilisearch-http/src/routes/mod.rs
@@ -5,14 +5,11 @@ use actix_web::{web, HttpRequest, HttpResponse};
 use index::{Settings, Unchecked};
 use index_scheduler::{IndexScheduler, Query, Status};
 use log::debug;
-use serde::{Deserialize, Serialize};
-
-use serde_json::json;
-use time::OffsetDateTime;
-
-use index::{Settings, Unchecked};
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::star_or::StarOr;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use time::OffsetDateTime;
 
 use crate::analytics::Analytics;
 use crate::extractors::authentication::{policies::*, GuardedData};
@@ -270,25 +267,26 @@ async fn get_stats(
         .first()
         .and_then(|task| task.index_uid.clone());
 
-    for index in index_scheduler.indexes()? {
-        if !search_rules.is_index_authorized(&index.name) {
+    for (name, index) in index_scheduler.indexes()? {
+        if !search_rules.is_index_authorized(&name) {
             continue;
         }
 
-        database_size += index.size()?;
+        database_size += index.on_disk_size()?;
 
+        let rtxn = index.read_txn()?;
         let stats = IndexStats {
-            number_of_documents: index.number_of_documents()?,
+            number_of_documents: index.number_of_documents(&rtxn)?,
             is_indexing: processing_index
                 .as_deref()
-                .map_or(false, |index_name| index.name == index_name),
-            field_distribution: index.field_distribution()?,
+                .map_or(false, |index_name| name == index_name),
+            field_distribution: index.field_distribution(&rtxn)?,
         };
 
-        let updated_at = index.updated_at()?;
+        let updated_at = index.updated_at(&rtxn)?;
         last_task = last_task.map_or(Some(updated_at), |last| Some(last.max(updated_at)));
 
-        indexes.insert(index.name.clone(), stats);
+        indexes.insert(name, stats);
     }
 
     let stats = Stats {