meilisearch/meilisearch-lib/src/index_controller/mod.rs

500 lines
15 KiB
Rust
Raw Normal View History

2021-05-11 02:25:09 +08:00
use std::collections::BTreeMap;
2021-09-30 17:29:27 +08:00
use std::fmt;
2021-09-15 00:39:02 +08:00
use std::path::{Path, PathBuf};
2021-03-06 19:57:56 +08:00
use std::sync::Arc;
use std::time::Duration;
2021-03-04 19:03:06 +08:00
2021-09-15 00:39:02 +08:00
use actix_web::error::PayloadError;
use bytes::Bytes;
2021-05-11 02:24:14 +08:00
use chrono::{DateTime, Utc};
2021-09-15 00:39:02 +08:00
use futures::Stream;
2021-03-24 18:29:11 +08:00
use log::info;
2021-09-15 00:39:02 +08:00
use milli::update::IndexDocumentsMethod;
2021-03-16 01:11:10 +08:00
use serde::{Deserialize, Serialize};
2021-09-24 17:53:11 +08:00
use tokio::task::spawn_blocking;
2021-03-06 19:57:56 +08:00
use tokio::time::sleep;
use uuid::Uuid;
2021-05-11 02:25:09 +08:00
use dump_actor::DumpActorHandle;
2021-05-27 02:42:09 +08:00
pub use dump_actor::{DumpInfo, DumpStatus};
2021-09-15 00:39:02 +08:00
use snapshot::load_snapshot;
2021-04-01 22:44:42 +08:00
2021-09-29 04:22:59 +08:00
use crate::index::error::Result as IndexResult;
use crate::index::{
Checked, Document, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings, Unchecked,
};
2021-09-24 17:53:11 +08:00
use crate::index_controller::index_resolver::create_index_resolver;
2021-09-27 22:48:03 +08:00
use crate::index_controller::snapshot::SnapshotService;
2021-09-22 21:07:04 +08:00
use crate::options::IndexerOpts;
2021-06-23 20:48:33 +08:00
use error::Result;
2021-04-01 22:44:42 +08:00
2021-05-27 20:30:20 +08:00
use self::dump_actor::load_dump;
2021-09-24 17:53:11 +08:00
use self::index_resolver::error::IndexResolverError;
2021-09-29 04:22:59 +08:00
use self::index_resolver::HardStateIndexResolver;
2021-09-22 17:52:29 +08:00
use self::updates::status::UpdateStatus;
2021-09-22 21:07:04 +08:00
use self::updates::UpdateMsg;
2021-05-27 20:30:20 +08:00
2021-05-27 02:42:09 +08:00
mod dump_actor;
2021-06-15 23:39:07 +08:00
pub mod error;
2021-09-29 04:22:59 +08:00
mod index_resolver;
2021-04-01 22:44:42 +08:00
mod snapshot;
2021-09-22 21:07:04 +08:00
pub mod update_file_store;
2021-09-22 17:52:29 +08:00
pub mod updates;
2021-09-15 00:39:02 +08:00
2021-09-22 21:07:04 +08:00
/// Boxed, type-erased stream of payload chunks.
///
/// Each item yielded by the stream is either a `Bytes` chunk or a `PayloadError`.
/// The trait object is additionally required to be `Send + Sync + Unpin + 'static`.
pub type Payload = Box<
dyn Stream<Item = std::result::Result<Bytes, PayloadError>> + Send + Sync + 'static + Unpin,
>;
2021-02-02 02:51:47 +08:00
2021-02-04 00:44:20 +08:00
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct IndexMetadata {
2021-04-15 00:55:04 +08:00
#[serde(skip)]
pub uuid: Uuid,
2021-04-01 22:44:42 +08:00
pub uid: String,
2021-03-16 01:35:16 +08:00
name: String,
2021-03-07 03:12:20 +08:00
#[serde(flatten)]
2021-09-24 17:53:11 +08:00
pub meta: IndexMeta,
2021-02-04 00:44:20 +08:00
}
2021-02-09 23:08:13 +08:00
/// Optional settings supplied when updating an index.
#[derive(Clone, Debug)]
pub struct IndexSettings {
    /// Requested index uid, if any.
    pub uid: Option<String>,
    /// Requested primary key, if any.
    pub primary_key: Option<String>,
}
2021-03-04 19:03:06 +08:00
2021-05-11 02:25:09 +08:00
#[derive(Clone)]
2021-03-04 19:03:06 +08:00
pub struct IndexController {
2021-09-24 17:53:11 +08:00
index_resolver: Arc<HardStateIndexResolver>,
2021-09-27 22:48:03 +08:00
update_sender: updates::UpdateSender,
2021-05-11 02:25:09 +08:00
dump_handle: dump_actor::DumpActorHandleImpl,
2021-03-04 19:03:06 +08:00
}
2021-09-22 21:07:04 +08:00
/// Format of the payload supplied with a document addition.
#[derive(Debug)]
pub enum DocumentAdditionFormat {
    Json,
    Csv,
    Ndjson,
}

impl fmt::Display for DocumentAdditionFormat {
    /// Writes the lowercase name of the format (`json`, `ndjson` or `csv`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Json => "json",
            Self::Ndjson => "ndjson",
            Self::Csv => "csv",
        };
        f.write_str(label)
    }
}
2021-06-23 18:18:34 +08:00
#[derive(Serialize, Debug)]
2021-04-16 01:54:25 +08:00
#[serde(rename_all = "camelCase")]
2021-04-15 00:55:04 +08:00
pub struct Stats {
pub database_size: u64,
pub last_update: Option<DateTime<Utc>>,
pub indexes: BTreeMap<String, IndexStats>,
}
2021-09-29 04:22:59 +08:00
#[allow(clippy::large_enum_variant)]
2021-09-22 21:07:04 +08:00
#[derive(derivative::Derivative)]
#[derivative(Debug)]
2021-09-15 00:39:02 +08:00
pub enum Update {
2021-09-24 21:21:07 +08:00
DeleteDocuments(Vec<String>),
ClearDocuments,
2021-09-24 20:55:57 +08:00
Settings(Settings<Unchecked>),
2021-09-15 00:39:02 +08:00
DocumentAddition {
2021-09-29 04:22:59 +08:00
#[derivative(Debug = "ignore")]
2021-09-15 00:39:02 +08:00
payload: Payload,
primary_key: Option<String>,
method: IndexDocumentsMethod,
format: DocumentAdditionFormat,
2021-09-22 21:07:04 +08:00
},
2021-09-15 00:39:02 +08:00
}
/// Collects the options needed to build an `IndexController`.
///
/// Every option starts unset (`None` / `false`) through the derived `Default`.
#[derive(Default, Debug)]
pub struct IndexControllerBuilder {
    /// Maximum size of an index; required by `build`.
    max_index_size: Option<usize>,
    /// Maximum size of the update store.
    max_update_store_size: Option<usize>,
    /// Directory handed to the snapshot service when snapshots are scheduled.
    snapshot_dir: Option<PathBuf>,
    /// Snapshot to load before the controller is created.
    import_snapshot: Option<PathBuf>,
    /// Interval handed to the snapshot service when snapshots are scheduled.
    snapshot_interval: Option<Duration>,
    /// Forwarded to `load_snapshot` when a snapshot is imported.
    ignore_snapshot_if_db_exists: bool,
    /// Forwarded to `load_snapshot` when a snapshot is imported.
    ignore_missing_snapshot: bool,
    /// Whether `build` spawns the snapshot service.
    schedule_snapshot: bool,
    /// Dump to load when no snapshot is imported.
    dump_src: Option<PathBuf>,
    /// Directory handed to the dump actor; required by `build`.
    dump_dst: Option<PathBuf>,
}
2021-03-17 19:01:56 +08:00
impl IndexControllerBuilder {
2021-09-22 21:07:04 +08:00
pub fn build(
self,
db_path: impl AsRef<Path>,
indexer_options: IndexerOpts,
) -> anyhow::Result<IndexController> {
let index_size = self
.max_index_size
.ok_or_else(|| anyhow::anyhow!("Missing index size"))?;
let update_store_size = self
.max_index_size
.ok_or_else(|| anyhow::anyhow!("Missing update database size"))?;
if let Some(ref path) = self.import_snapshot {
2021-03-24 18:29:11 +08:00
info!("Loading from snapshot {:?}", path);
2021-03-23 23:37:46 +08:00
load_snapshot(
db_path.as_ref(),
2021-03-23 23:37:46 +08:00
path,
self.ignore_snapshot_if_db_exists,
self.ignore_missing_snapshot,
2021-03-23 23:37:46 +08:00
)?;
} else if let Some(ref src_path) = self.dump_src {
2021-05-27 20:30:20 +08:00
load_dump(
db_path.as_ref(),
2021-05-27 20:30:20 +08:00
src_path,
index_size,
update_store_size,
&indexer_options,
2021-05-27 20:30:20 +08:00
)?;
2021-03-23 02:19:37 +08:00
}
std::fs::create_dir_all(db_path.as_ref())?;
2021-03-24 00:23:57 +08:00
2021-09-29 04:22:59 +08:00
let index_resolver = Arc::new(create_index_resolver(
&db_path,
index_size,
&indexer_options,
)?);
2021-09-22 17:52:29 +08:00
#[allow(unreachable_code)]
2021-09-29 04:22:59 +08:00
let update_sender =
updates::create_update_handler(index_resolver.clone(), &db_path, update_store_size)?;
2021-09-29 04:22:59 +08:00
let dump_path = self
.dump_dst
.ok_or_else(|| anyhow::anyhow!("Missing dump directory path"))?;
2021-05-27 02:42:09 +08:00
let dump_handle = dump_actor::DumpActorHandleImpl::new(
2021-09-24 17:53:11 +08:00
dump_path,
index_resolver.clone(),
2021-09-27 22:48:03 +08:00
update_sender.clone(),
index_size,
update_store_size,
2021-05-27 02:42:09 +08:00
)?;
2021-03-17 19:01:56 +08:00
2021-09-27 22:48:03 +08:00
if self.schedule_snapshot {
let snapshot_service = SnapshotService::new(
index_resolver.clone(),
update_sender.clone(),
2021-09-29 04:22:59 +08:00
self.snapshot_interval
.ok_or_else(|| anyhow::anyhow!("Snapshot interval not provided."))?,
self.snapshot_dir
.ok_or_else(|| anyhow::anyhow!("Snapshot path not provided."))?,
2021-09-27 22:48:03 +08:00
db_path
2021-09-29 04:22:59 +08:00
.as_ref()
.file_name()
.map(|n| n.to_owned().into_string().expect("invalid path"))
.unwrap_or_else(|| String::from("data.ms")),
2021-09-27 22:48:03 +08:00
);
tokio::task::spawn(snapshot_service.run());
}
2021-03-17 19:01:56 +08:00
Ok(IndexController {
2021-09-24 17:53:11 +08:00
index_resolver,
2021-09-27 22:48:03 +08:00
update_sender,
2021-05-11 02:25:09 +08:00
dump_handle,
2021-03-16 01:11:10 +08:00
})
2021-03-04 19:03:06 +08:00
}
/// Set the index controller builder's max update store size.
pub fn set_max_update_store_size(&mut self, max_update_store_size: usize) -> &mut Self {
self.max_update_store_size.replace(max_update_store_size);
self
}
pub fn set_max_index_size(&mut self, size: usize) -> &mut Self {
self.max_index_size.replace(size);
self
}
/// Set the index controller builder's snapshot path.
pub fn set_snapshot_dir(&mut self, snapshot_dir: PathBuf) -> &mut Self {
self.snapshot_dir.replace(snapshot_dir);
self
}
/// Set the index controller builder's ignore snapshot if db exists.
2021-09-22 21:07:04 +08:00
pub fn set_ignore_snapshot_if_db_exists(
&mut self,
ignore_snapshot_if_db_exists: bool,
) -> &mut Self {
self.ignore_snapshot_if_db_exists = ignore_snapshot_if_db_exists;
self
}
/// Set the index controller builder's ignore missing snapshot.
pub fn set_ignore_missing_snapshot(&mut self, ignore_missing_snapshot: bool) -> &mut Self {
self.ignore_missing_snapshot = ignore_missing_snapshot;
self
}
/// Set the index controller builder's dump src.
pub fn set_dump_src(&mut self, dump_src: PathBuf) -> &mut Self {
self.dump_src.replace(dump_src);
self
}
/// Set the index controller builder's dump dst.
pub fn set_dump_dst(&mut self, dump_dst: PathBuf) -> &mut Self {
self.dump_dst.replace(dump_dst);
self
}
/// Set the index controller builder's import snapshot.
pub fn set_import_snapshot(&mut self, import_snapshot: PathBuf) -> &mut Self {
self.import_snapshot.replace(import_snapshot);
self
}
2021-09-27 22:48:03 +08:00
/// Set the index controller builder's snapshot interval sec.
pub fn set_snapshot_interval(&mut self, snapshot_interval: Duration) -> &mut Self {
self.snapshot_interval = Some(snapshot_interval);
self
}
/// Set the index controller builder's schedule snapshot.
pub fn set_schedule_snapshot(&mut self) -> &mut Self {
self.schedule_snapshot = true;
self
}
}
impl IndexController {
pub fn builder() -> IndexControllerBuilder {
IndexControllerBuilder::default()
}
2021-09-29 04:22:59 +08:00
pub async fn register_update(
&self,
uid: String,
update: Update,
create_index: bool,
) -> Result<UpdateStatus> {
2021-09-24 20:55:57 +08:00
match self.index_resolver.get_uuid(uid).await {
2021-09-15 00:39:02 +08:00
Ok(uuid) => {
2021-09-27 22:48:03 +08:00
let update_result = UpdateMsg::update(&self.update_sender, uuid, update).await?;
2021-09-15 00:39:02 +08:00
Ok(update_result)
2021-09-22 21:07:04 +08:00
}
2021-09-24 17:53:11 +08:00
Err(IndexResolverError::UnexistingIndex(name)) => {
2021-09-29 00:10:09 +08:00
if create_index {
let index = self.index_resolver.create_index(name, None).await?;
2021-09-29 04:22:59 +08:00
let update_result =
UpdateMsg::update(&self.update_sender, index.uuid, update).await?;
2021-09-29 00:10:09 +08:00
Ok(update_result)
} else {
2021-09-29 04:22:59 +08:00
Err(IndexResolverError::UnexistingIndex(name).into())
2021-09-29 00:10:09 +08:00
}
2021-03-04 19:03:06 +08:00
}
Err(e) => Err(e.into()),
}
2021-03-04 19:03:06 +08:00
}
pub async fn update_status(&self, uid: String, id: u64) -> Result<UpdateStatus> {
2021-09-24 17:53:11 +08:00
let uuid = self.index_resolver.get_uuid(uid).await?;
2021-09-27 22:48:03 +08:00
let result = UpdateMsg::get_update(&self.update_sender, uuid, id).await?;
2021-03-06 17:51:52 +08:00
Ok(result)
2021-03-04 19:03:06 +08:00
}
pub async fn all_update_status(&self, uid: String) -> Result<Vec<UpdateStatus>> {
2021-09-24 17:53:11 +08:00
let uuid = self.index_resolver.get_uuid(uid).await?;
2021-09-27 22:48:03 +08:00
let result = UpdateMsg::list_updates(&self.update_sender, uuid).await?;
2021-03-06 01:34:04 +08:00
Ok(result)
2021-03-04 19:03:06 +08:00
}
pub async fn list_indexes(&self) -> Result<Vec<IndexMetadata>> {
2021-09-24 17:53:11 +08:00
let indexes = self.index_resolver.list().await?;
2021-03-07 03:12:20 +08:00
let mut ret = Vec::new();
2021-09-24 17:53:11 +08:00
for (uid, index) in indexes {
let meta = index.meta()?;
2021-03-22 17:17:38 +08:00
let meta = IndexMetadata {
2021-09-24 17:53:11 +08:00
uuid: index.uuid,
2021-03-22 17:17:38 +08:00
name: uid.clone(),
uid,
meta,
};
2021-03-15 23:52:05 +08:00
ret.push(meta);
2021-03-07 03:12:20 +08:00
}
Ok(ret)
2021-03-04 19:03:06 +08:00
}
pub async fn settings(&self, uid: String) -> Result<Settings<Checked>> {
2021-09-24 17:53:11 +08:00
let index = self.index_resolver.get_index(uid).await?;
let settings = spawn_blocking(move || index.settings()).await??;
2021-03-04 19:38:55 +08:00
Ok(settings)
}
2021-03-04 21:20:19 +08:00
pub async fn documents(
&self,
2021-03-12 05:47:29 +08:00
uid: String,
2021-03-04 21:20:19 +08:00
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<String>>,
) -> Result<Vec<Document>> {
2021-09-24 17:53:11 +08:00
let index = self.index_resolver.get_index(uid).await?;
2021-09-29 04:22:59 +08:00
let documents =
spawn_blocking(move || index.retrieve_documents(offset, limit, attributes_to_retrieve))
.await??;
2021-03-04 21:20:19 +08:00
Ok(documents)
}
2021-03-04 22:09:00 +08:00
pub async fn document(
&self,
2021-03-12 05:47:29 +08:00
uid: String,
2021-03-04 22:09:00 +08:00
doc_id: String,
attributes_to_retrieve: Option<Vec<String>>,
) -> Result<Document> {
2021-09-24 17:53:11 +08:00
let index = self.index_resolver.get_index(uid).await?;
2021-09-29 04:22:59 +08:00
let document =
spawn_blocking(move || index.retrieve_document(doc_id, attributes_to_retrieve))
.await??;
2021-03-04 22:09:00 +08:00
Ok(document)
}
2021-03-16 01:11:10 +08:00
pub async fn update_index(
&self,
uid: String,
2021-06-21 19:57:32 +08:00
mut index_settings: IndexSettings,
) -> Result<IndexMetadata> {
2021-09-24 17:53:11 +08:00
index_settings.uid.take();
let index = self.index_resolver.get_index(uid.clone()).await?;
let uuid = index.uuid;
2021-09-29 04:22:59 +08:00
let meta =
spawn_blocking(move || index.update_primary_key(index_settings.primary_key)).await??;
2021-03-22 17:17:38 +08:00
let meta = IndexMetadata {
2021-04-15 00:55:04 +08:00
uuid,
2021-03-22 17:17:38 +08:00
name: uid.clone(),
uid,
meta,
};
2021-03-12 21:48:43 +08:00
Ok(meta)
2021-03-04 19:03:06 +08:00
}
pub async fn search(&self, uid: String, query: SearchQuery) -> Result<SearchResult> {
2021-09-29 00:10:09 +08:00
let index = self.index_resolver.get_index(uid.clone()).await?;
let result = spawn_blocking(move || index.perform_search(query)).await??;
2021-03-04 19:03:06 +08:00
Ok(result)
}
2021-03-07 03:17:58 +08:00
pub async fn get_index(&self, uid: String) -> Result<IndexMetadata> {
2021-09-24 17:53:11 +08:00
let index = self.index_resolver.get_index(uid.clone()).await?;
let uuid = index.uuid;
let meta = spawn_blocking(move || index.meta()).await??;
2021-03-22 17:17:38 +08:00
let meta = IndexMetadata {
2021-04-15 00:55:04 +08:00
uuid,
2021-03-22 17:17:38 +08:00
name: uid.clone(),
uid,
meta,
};
2021-03-15 23:52:05 +08:00
Ok(meta)
2021-03-07 03:17:58 +08:00
}
2021-04-01 22:44:42 +08:00
pub async fn get_index_stats(&self, uid: String) -> Result<IndexStats> {
2021-09-27 22:48:03 +08:00
let update_infos = UpdateMsg::get_info(&self.update_sender).await?;
2021-09-24 17:53:11 +08:00
let index = self.index_resolver.get_index(uid).await?;
let uuid = index.uuid;
let mut stats = spawn_blocking(move || index.stats()).await??;
// Check if the currently indexing update is from our index.
2021-04-22 16:14:29 +08:00
stats.is_indexing = Some(Some(uuid) == update_infos.processing);
2021-04-15 00:55:04 +08:00
Ok(stats)
}
pub async fn get_all_stats(&self) -> Result<Stats> {
2021-09-27 22:48:03 +08:00
let update_infos = UpdateMsg::get_info(&self.update_sender).await?;
2021-09-28 18:05:22 +08:00
let mut database_size = self.index_resolver.get_uuids_size().await? + update_infos.size;
2021-04-15 00:55:04 +08:00
let mut last_update: Option<DateTime<_>> = None;
let mut indexes = BTreeMap::new();
2021-09-24 17:53:11 +08:00
for (index_uid, index) in self.index_resolver.list().await? {
let uuid = index.uuid;
let (mut stats, meta) = spawn_blocking::<_, IndexResult<_>>(move || {
let stats = index.stats()?;
let meta = index.meta()?;
Ok((stats, meta))
2021-09-29 04:22:59 +08:00
})
.await??;
2021-09-24 17:53:11 +08:00
database_size += stats.size;
2021-04-15 00:55:04 +08:00
2021-09-24 17:53:11 +08:00
last_update = last_update.map_or(Some(meta.updated_at), |last| {
Some(last.max(meta.updated_at))
2021-04-15 00:55:04 +08:00
});
2021-09-24 17:53:11 +08:00
// Check if the currently indexing update is from our index.
stats.is_indexing = Some(Some(uuid) == update_infos.processing);
2021-04-15 00:55:04 +08:00
2021-09-24 17:53:11 +08:00
indexes.insert(index_uid, stats);
2021-04-15 00:55:04 +08:00
}
Ok(Stats {
database_size,
last_update,
indexes,
})
}
2021-05-11 02:25:09 +08:00
pub async fn create_dump(&self) -> Result<DumpInfo> {
2021-05-11 02:25:09 +08:00
Ok(self.dump_handle.create_dump().await?)
}
pub async fn dump_info(&self, uid: String) -> Result<DumpInfo> {
2021-05-11 02:25:09 +08:00
Ok(self.dump_handle.dump_info(uid).await?)
}
2021-09-29 00:10:09 +08:00
2021-09-29 04:22:59 +08:00
pub async fn create_index(
&self,
uid: String,
primary_key: Option<String>,
) -> Result<IndexMetadata> {
let index = self
.index_resolver
.create_index(uid.clone(), primary_key)
.await?;
2021-09-29 00:10:09 +08:00
let meta = spawn_blocking(move || -> IndexResult<_> {
let meta = index.meta()?;
let meta = IndexMetadata {
uuid: index.uuid,
uid: uid.clone(),
name: uid,
meta,
};
Ok(meta)
2021-09-29 04:22:59 +08:00
})
.await??;
2021-09-29 00:10:09 +08:00
Ok(meta)
}
pub async fn delete_index(&self, uid: String) -> Result<()> {
let uuid = self.index_resolver.delete_index(uid).await?;
let update_sender = self.update_sender.clone();
tokio::spawn(async move {
let _ = UpdateMsg::delete(&update_sender, uuid).await;
});
Ok(())
}
2021-03-04 19:03:06 +08:00
}
2021-03-06 19:57:56 +08:00
/// Waits until `item` is the only strong reference to its value, then returns the value.
///
/// Ownership is attempted with `Arc::try_unwrap`; after each failed attempt the task sleeps
/// for 100 milliseconds before trying again.
pub async fn get_arc_ownership_blocking<T>(mut item: Arc<T>) -> T {
    loop {
        item = match Arc::try_unwrap(item) {
            Ok(inner) => return inner,
            // Still shared: take the handle back and retry after the pause below.
            Err(still_shared) => still_shared,
        };
        sleep(Duration::from_millis(100)).await;
    }
}