Make sure that meilisearch-http works without index wrapper

Kerollmops 2022-10-04 11:06:48 +02:00, committed by Clément Renault
parent c70f375669
commit cf6084151b
9 changed files with 230 additions and 43 deletions

View File

@@ -104,12 +104,14 @@ impl IndexMapper {
         Ok(index)
     }
 
-    pub fn indexes(&self, rtxn: &RoTxn) -> Result<Vec<Index>> {
+    pub fn indexes(&self, rtxn: &RoTxn) -> Result<Vec<(String, Index)>> {
         self.index_mapping
             .iter(rtxn)?
             .map(|ret| {
-                ret.map_err(Error::from)
-                    .and_then(|(name, _)| self.index(rtxn, name))
+                ret.map_err(Error::from).and_then(|(name, _)| {
+                    self.index(rtxn, name)
+                        .map(|index| (name.to_string(), index))
+                })
             })
             .collect()
     }
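
A minimal usage sketch of the new signature, assuming the `IndexMapper`, `RoTxn`, and `Result` types from this module; `print_index_names` is a hypothetical helper, not part of the commit:

// Hypothetical caller: `indexes` now yields `(String, Index)` pairs, so the
// index name travels with the handle instead of living on the index itself.
fn print_index_names(mapper: &IndexMapper, rtxn: &RoTxn) -> Result<()> {
    for (name, _index) in mapper.indexes(rtxn)? {
        println!("index: {name}");
    }
    Ok(())
}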

View File

@@ -231,7 +231,7 @@ impl IndexScheduler {
     }
 
     /// Return and open all the indexes.
-    pub fn indexes(&self) -> Result<Vec<Index>> {
+    pub fn indexes(&self) -> Result<Vec<(String, Index)>> {
        let rtxn = self.env.read_txn()?;
        self.index_mapper.indexes(&rtxn)
    }
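
A hedged sketch of a caller built on the new return type; `index_names` is a hypothetical helper, and `IndexScheduler` and `Result` are the types from this crate:

// Hypothetical: collect only the index names, dropping the opened handles.
fn index_names(scheduler: &IndexScheduler) -> Result<Vec<String>> {
    Ok(scheduler
        .indexes()?
        .into_iter()
        .map(|(name, _)| name)
        .collect())
}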

View File

@@ -1,4 +1,5 @@
 pub use search::{
+    all_documents, perform_search, retrieve_document, retrieve_documents, settings,
     MatchingStrategy, SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
     DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
 };
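
These re-exports let meilisearch-http call the former wrapper methods as free functions. A minimal consumer sketch, assuming `Index` is milli's index type re-exported through index-scheduler and that `index::error::Result` is this crate's public result alias; `run_search` is hypothetical:

use index::{perform_search, SearchQuery, SearchResult};
use index_scheduler::milli::Index;

// Hypothetical wrapper: the query is built elsewhere and simply forwarded
// to the re-exported free function.
fn run_search(index: &Index, query: SearchQuery) -> index::error::Result<SearchResult> {
    perform_search(index, query)
}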

View File

@@ -1,19 +1,25 @@
 use std::cmp::min;
 use std::collections::{BTreeMap, BTreeSet, HashSet};
+use std::marker::PhantomData;
 use std::str::FromStr;
 use std::time::Instant;
 
 use either::Either;
+use fst::IntoStreamer;
+use milli::heed::RoTxn;
 use milli::tokenizer::TokenizerBuilder;
+use milli::update::Setting;
 use milli::{
-    AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder,
-    SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    obkv_to_json, AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds,
+    MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Value};
 
 use crate::error::FacetError;
+use crate::updates::{FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, TypoSettings};
+use crate::{Checked, Settings};
 
 use super::error::{IndexError, Result};
@@ -282,6 +288,184 @@ pub fn perform_search(index: &Index, query: SearchQuery) -> Result<SearchResult>
     Ok(result)
 }
 
+pub fn all_documents<'a>(
+    index: &Index,
+    rtxn: &'a RoTxn,
+) -> Result<impl Iterator<Item = Result<Document>> + 'a> {
+    let fields_ids_map = index.fields_ids_map(rtxn)?;
+    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+
+    Ok(index.all_documents(rtxn)?.map(move |ret| {
+        ret.map_err(IndexError::from)
+            .and_then(|(_key, document)| -> Result<_> {
+                Ok(obkv_to_json(&all_fields, &fields_ids_map, document)?)
+            })
+    }))
+}
+
+pub fn retrieve_documents<S: AsRef<str>>(
+    index: &Index,
+    offset: usize,
+    limit: usize,
+    attributes_to_retrieve: Option<Vec<S>>,
+) -> Result<(u64, Vec<Document>)> {
+    let rtxn = index.read_txn()?;
+
+    let mut documents = Vec::new();
+    for document in all_documents(index, &rtxn)?.skip(offset).take(limit) {
+        let document = match &attributes_to_retrieve {
+            Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
+                &document?,
+                attributes_to_retrieve.iter().map(|s| s.as_ref()),
+            ),
+            None => document?,
+        };
+        documents.push(document);
+    }
+
+    let number_of_documents = index.number_of_documents(&rtxn)?;
+    Ok((number_of_documents, documents))
+}
+
+pub fn retrieve_document<S: AsRef<str>>(
+    index: &Index,
+    doc_id: &str,
+    attributes_to_retrieve: Option<Vec<S>>,
+) -> Result<Document> {
+    let txn = index.read_txn()?;
+
+    let fields_ids_map = index.fields_ids_map(&txn)?;
+    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+
+    let internal_id = index
+        .external_documents_ids(&txn)?
+        .get(doc_id.as_bytes())
+        .ok_or_else(|| IndexError::DocumentNotFound(doc_id.to_string()))?;
+
+    let document = index
+        .documents(&txn, std::iter::once(internal_id))?
+        .into_iter()
+        .next()
+        .map(|(_, d)| d)
+        .ok_or_else(|| IndexError::DocumentNotFound(doc_id.to_string()))?;
+
+    let document = obkv_to_json(&all_fields, &fields_ids_map, document)?;
+    let document = match &attributes_to_retrieve {
+        Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
+            &document,
+            attributes_to_retrieve.iter().map(|s| s.as_ref()),
+        ),
+        None => document,
+    };
+
+    Ok(document)
+}
+
+pub fn settings(index: &Index, rtxn: &RoTxn) -> Result<Settings<Checked>> {
+    let displayed_attributes = index
+        .displayed_fields(rtxn)?
+        .map(|fields| fields.into_iter().map(String::from).collect());
+
+    let searchable_attributes = index
+        .user_defined_searchable_fields(rtxn)?
+        .map(|fields| fields.into_iter().map(String::from).collect());
+
+    let filterable_attributes = index.filterable_fields(rtxn)?.into_iter().collect();
+
+    let sortable_attributes = index.sortable_fields(rtxn)?.into_iter().collect();
+
+    let criteria = index
+        .criteria(rtxn)?
+        .into_iter()
+        .map(|c| c.to_string())
+        .collect();
+
+    let stop_words = index
+        .stop_words(rtxn)?
+        .map(|stop_words| -> Result<BTreeSet<_>> {
+            Ok(stop_words.stream().into_strs()?.into_iter().collect())
+        })
+        .transpose()?
+        .unwrap_or_default();
+    let distinct_field = index.distinct_field(rtxn)?.map(String::from);
+
+    // in milli each word in the synonyms map were split on their separator. Since we lost
+    // this information we are going to put space between words.
+    let synonyms = index
+        .synonyms(rtxn)?
+        .iter()
+        .map(|(key, values)| {
+            (
+                key.join(" "),
+                values.iter().map(|value| value.join(" ")).collect(),
+            )
+        })
+        .collect();
+
+    let min_typo_word_len = MinWordSizeTyposSetting {
+        one_typo: Setting::Set(index.min_word_len_one_typo(rtxn)?),
+        two_typos: Setting::Set(index.min_word_len_two_typos(rtxn)?),
+    };
+
+    let disabled_words = match index.exact_words(rtxn)? {
+        Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
+        None => BTreeSet::new(),
+    };
+
+    let disabled_attributes = index
+        .exact_attributes(rtxn)?
+        .into_iter()
+        .map(String::from)
+        .collect();
+
+    let typo_tolerance = TypoSettings {
+        enabled: Setting::Set(index.authorize_typos(rtxn)?),
+        min_word_size_for_typos: Setting::Set(min_typo_word_len),
+        disable_on_words: Setting::Set(disabled_words),
+        disable_on_attributes: Setting::Set(disabled_attributes),
+    };
+
+    let faceting = FacetingSettings {
+        max_values_per_facet: Setting::Set(
+            index
+                .max_values_per_facet(rtxn)?
+                .unwrap_or(DEFAULT_VALUES_PER_FACET),
+        ),
+    };
+
+    let pagination = PaginationSettings {
+        max_total_hits: Setting::Set(
+            index
+                .pagination_max_total_hits(rtxn)?
+                .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
+        ),
+    };
+
+    Ok(Settings {
+        displayed_attributes: match displayed_attributes {
+            Some(attrs) => Setting::Set(attrs),
+            None => Setting::Reset,
+        },
+        searchable_attributes: match searchable_attributes {
+            Some(attrs) => Setting::Set(attrs),
+            None => Setting::Reset,
+        },
+        filterable_attributes: Setting::Set(filterable_attributes),
+        sortable_attributes: Setting::Set(sortable_attributes),
+        ranking_rules: Setting::Set(criteria),
+        stop_words: Setting::Set(stop_words),
+        distinct_attribute: match distinct_field {
+            Some(field) => Setting::Set(field),
+            None => Setting::Reset,
+        },
+        synonyms: Setting::Set(synonyms),
+        typo_tolerance: Setting::Set(typo_tolerance),
+        faceting: Setting::Set(faceting),
+        pagination: Setting::Set(pagination),
+        _kind: PhantomData,
+    })
+}
+
 fn insert_geo_distance(sorts: &[String], document: &mut Document) {
     lazy_static::lazy_static! {
         static ref GEO_REGEX: Regex =
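
A short usage sketch of the new document helpers; the document id "42" and the attribute list are hypothetical arguments, and `Document`, `Index`, and `Result` are this module's types:

// Hypothetical: page through documents, then fetch one by its external id.
fn documents_example(index: &Index) -> Result<()> {
    let (total, docs) = retrieve_documents(index, 0, 20, Some(vec!["title", "id"]))?;
    println!("{total} documents in total, {} returned", docs.len());

    let doc = retrieve_document(index, "42", None::<Vec<&str>>)?;
    println!("{doc:?}");
    Ok(())
}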

View File

@@ -8,6 +8,7 @@ use actix_web::{web, HttpRequest, HttpResponse};
 use bstr::ByteSlice;
 use document_formats::{read_csv, read_json, read_ndjson, PayloadType};
 use futures::{Stream, StreamExt};
+use index::{retrieve_document, retrieve_documents};
 use index_scheduler::milli::update::IndexDocumentsMethod;
 use index_scheduler::IndexScheduler;
 use index_scheduler::{KindWithContent, TaskView};
@@ -103,7 +104,7 @@ pub async fn get_document(
     let attributes_to_retrieve = fields.and_then(fold_star_or);
 
     let index = index_scheduler.index(&path.index_uid)?;
-    let document = index.retrieve_document(&path.document_id, attributes_to_retrieve)?;
+    let document = retrieve_document(&index, &path.document_id, attributes_to_retrieve)?;
     debug!("returns: {:?}", document);
     Ok(HttpResponse::Ok().json(document))
 }
@@ -149,7 +150,7 @@ pub async fn get_all_documents(
     let attributes_to_retrieve = fields.and_then(fold_star_or);
 
     let index = index_scheduler.index(&index_uid)?;
-    let (total, documents) = index.retrieve_documents(offset, limit, attributes_to_retrieve)?;
+    let (total, documents) = retrieve_documents(&index, offset, limit, attributes_to_retrieve)?;
 
     let ret = PaginationView::new(offset, limit, total as usize, documents);

View File

@@ -1,6 +1,6 @@
 use actix_web::web::Data;
 use actix_web::{web, HttpRequest, HttpResponse};
-use index_scheduler::milli::FieldDistribution;
+use index_scheduler::milli::{FieldDistribution, Index};
 use index_scheduler::{IndexScheduler, KindWithContent, Query, Status};
 use log::debug;
 use meilisearch_types::error::ResponseError;
@@ -11,7 +11,6 @@ use time::OffsetDateTime;
 
 use crate::analytics::Analytics;
 use crate::extractors::authentication::{policies::*, AuthenticationError, GuardedData};
 use crate::extractors::sequential_extractor::SeqHandler;
-use index_scheduler::task::TaskView;
 
 use super::Pagination;
@@ -51,15 +50,14 @@ pub struct IndexView {
     pub primary_key: Option<String>,
 }
 
-impl TryFrom<&Index> for IndexView {
-    type Error = index::error::IndexError;
-
-    fn try_from(index: &Index) -> Result<IndexView, Self::Error> {
+impl IndexView {
+    fn new(uid: String, index: &Index) -> Result<IndexView, index::error::IndexError> {
+        let rtxn = index.read_txn()?;
         Ok(IndexView {
-            uid: index.name.clone(),
-            created_at: index.created_at()?,
-            updated_at: index.updated_at()?,
-            primary_key: index.primary_key()?,
+            uid,
+            created_at: index.created_at(&rtxn)?,
+            updated_at: index.updated_at(&rtxn)?,
+            primary_key: index.primary_key(&rtxn)?.map(String::from),
         })
     }
 }
@@ -71,9 +69,9 @@ pub async fn list_indexes(
     let search_rules = &index_scheduler.filters().search_rules;
     let indexes: Vec<_> = index_scheduler.indexes()?;
     let indexes = indexes
-        .iter()
-        .filter(|index| search_rules.is_index_authorized(&index.name))
-        .map(IndexView::try_from)
+        .into_iter()
+        .filter(|(name, _)| search_rules.is_index_authorized(name))
+        .map(|(name, index)| IndexView::new(name, &index))
         .collect::<Result<Vec<_>, _>>()?;
 
     let ret = paginate.auto_paginate_sized(indexes.into_iter());
@@ -130,7 +128,7 @@ pub async fn get_index(
     index_uid: web::Path<String>,
 ) -> Result<HttpResponse, ResponseError> {
     let index = index_scheduler.index(&index_uid)?;
-    let index_view: IndexView = (&index).try_into()?;
+    let index_view = IndexView::new(index_uid.into_inner(), &index)?;
 
     debug!("returns: {:?}", index_view);
@@ -216,10 +214,11 @@ impl IndexStats {
         let is_processing = !processing_task.is_empty();
 
         let index = index_scheduler.index(&index_uid)?;
+        let rtxn = index.read_txn()?;
         Ok(IndexStats {
-            number_of_documents: index.number_of_documents()?,
+            number_of_documents: index.number_of_documents(&rtxn)?,
            is_indexing: is_processing,
-            field_distribution: index.field_distribution()?,
+            field_distribution: index.field_distribution(&rtxn)?,
        })
    }
}
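
The `TryFrom<&Index>` impl had to go because the uid no longer lives on the `Index` itself but in the scheduler's name mapping, and a conversion trait cannot accept that extra argument. A hypothetical call site under the same assumption ("movies" is a placeholder uid):

// Sketch: the uid is passed alongside the handle rather than read from it.
fn movies_view(index: &Index) -> Result<IndexView, index::error::IndexError> {
    IndexView::new("movies".to_string(), index)
}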

View File

@@ -1,7 +1,7 @@
 use actix_web::web::Data;
 use actix_web::{web, HttpRequest, HttpResponse};
 use index::{
-    MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
+    perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
     DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
     DEFAULT_SEARCH_OFFSET,
 };
@@ -158,7 +158,7 @@ pub async fn search_with_url_query(
     let mut aggregate = SearchAggregator::from_query(&query, &req);
 
     let index = index_scheduler.index(&index_uid)?;
-    let search_result = index.perform_search(query);
+    let search_result = perform_search(&index, query);
     if let Ok(ref search_result) = search_result {
         aggregate.succeed(search_result);
     }
@@ -192,7 +192,7 @@ pub async fn search_with_post(
     let mut aggregate = SearchAggregator::from_query(&query, &req);
 
     let index = index_scheduler.index(&index_uid)?;
-    let search_result = index.perform_search(query);
+    let search_result = perform_search(&index, query);
     if let Ok(ref search_result) = search_result {
         aggregate.succeed(search_result);
     }

View File

@@ -97,7 +97,8 @@ macro_rules! make_setting_route {
                 index_uid: actix_web::web::Path<String>,
             ) -> std::result::Result<HttpResponse, ResponseError> {
                 let index = index_scheduler.index(&index_uid)?;
-                let settings = index.settings()?;
+                let rtxn = index.read_txn()?;
+                let settings = index::settings(&index, &rtxn)?;
 
                 debug!("returns: {:?}", settings);
                 let mut json = serde_json::json!(&settings);
@@ -454,7 +455,8 @@ pub async fn get_all(
     index_uid: web::Path<String>,
 ) -> Result<HttpResponse, ResponseError> {
     let index = index_scheduler.index(&index_uid)?;
-    let new_settings = index.settings()?;
+    let rtxn = index.read_txn()?;
+    let new_settings = index::settings(&index, &rtxn)?;
     debug!("returns: {:?}", new_settings);
     Ok(HttpResponse::Ok().json(new_settings))
 }
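
Both routes now follow the same transaction-owning pattern seen throughout this commit: the caller opens the read transaction and hands it to the free `index::settings` function. A minimal sketch, assuming the index crate's `IndexError` converts from `heed::Error` as the diffs above imply; `current_settings` is hypothetical:

// Hypothetical helper mirroring both route bodies above.
fn current_settings(
    index: &Index,
) -> Result<index::Settings<index::Checked>, index::error::IndexError> {
    let rtxn = index.read_txn()?;
    index::settings(index, &rtxn)
}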

View File

@@ -5,14 +5,11 @@ use actix_web::{web, HttpRequest, HttpResponse};
+use index::{Settings, Unchecked};
 use index_scheduler::{IndexScheduler, Query, Status};
 use log::debug;
-use serde::{Deserialize, Serialize};
-use serde_json::json;
-use time::OffsetDateTime;
-
-use index::{Settings, Unchecked};
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::star_or::StarOr;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use time::OffsetDateTime;
 
 use crate::analytics::Analytics;
 use crate::extractors::authentication::{policies::*, GuardedData};
@@ -270,25 +267,26 @@ async fn get_stats(
         .first()
         .and_then(|task| task.index_uid.clone());
 
-    for index in index_scheduler.indexes()? {
-        if !search_rules.is_index_authorized(&index.name) {
+    for (name, index) in index_scheduler.indexes()? {
+        if !search_rules.is_index_authorized(&name) {
             continue;
         }
 
-        database_size += index.size()?;
+        database_size += index.on_disk_size()?;
+
+        let rtxn = index.read_txn()?;
         let stats = IndexStats {
-            number_of_documents: index.number_of_documents()?,
+            number_of_documents: index.number_of_documents(&rtxn)?,
             is_indexing: processing_index
                 .as_deref()
-                .map_or(false, |index_name| index.name == index_name),
-            field_distribution: index.field_distribution()?,
+                .map_or(false, |index_name| name == index_name),
+            field_distribution: index.field_distribution(&rtxn)?,
         };
 
-        let updated_at = index.updated_at()?;
+        let updated_at = index.updated_at(&rtxn)?;
         last_task = last_task.map_or(Some(updated_at), |last| Some(last.max(updated_at)));
 
-        indexes.insert(index.name.clone(), stats);
+        indexes.insert(name, stats);
    }
 
    let stats = Stats {