2445: Seek-based tasks list r=Kerollmops a=Kerollmops

This PR implements the seek-based pagination for the tasks list following [the spec](https://github.com/meilisearch/specifications/pull/115).

Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2022-06-02 10:25:54 +00:00 committed by GitHub
commit c9cd1738a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 169 additions and 204 deletions

1
Cargo.lock generated
View File

@ -2115,6 +2115,7 @@ dependencies = [
"rayon", "rayon",
"regex", "regex",
"reqwest", "reqwest",
"roaring",
"rustls", "rustls",
"serde", "serde",
"serde_json", "serde_json",

View File

@ -14,6 +14,8 @@ use crate::task::{TaskListView, TaskStatus, TaskType, TaskView};
use super::{fold_star_or, StarOr}; use super::{fold_star_or, StarOr};
/// Default number of tasks returned in one page of the tasks list.
///
/// Declared as a function pointer so that it can be referenced by path from
/// the `#[serde(default = "DEFAULT_LIMIT")]` attribute on the `limit` query field.
const DEFAULT_LIMIT: fn() -> usize = || 20;
pub fn configure(cfg: &mut web::ServiceConfig) { pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::get().to(SeqHandler(get_tasks)))) cfg.service(web::resource("").route(web::get().to(SeqHandler(get_tasks))))
.service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))); .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task))));
@ -26,6 +28,9 @@ pub struct TaskFilterQuery {
type_: Option<CS<StarOr<TaskType>>>, type_: Option<CS<StarOr<TaskType>>>,
status: Option<CS<StarOr<TaskStatus>>>, status: Option<CS<StarOr<TaskStatus>>>,
index_uid: Option<CS<StarOr<IndexUid>>>, index_uid: Option<CS<StarOr<IndexUid>>>,
#[serde(default = "DEFAULT_LIMIT")]
limit: usize,
from: Option<TaskId>,
} }
#[rustfmt::skip] #[rustfmt::skip]
@ -68,11 +73,13 @@ async fn get_tasks(
type_, type_,
status, status,
index_uid, index_uid,
limit,
from,
} = params.into_inner(); } = params.into_inner();
let search_rules = &meilisearch.filters().search_rules; let search_rules = &meilisearch.filters().search_rules;
// We first tranform a potential indexUid=* into a "not specified indexUid filter" // We first transform a potential indexUid=* into a "not specified indexUid filter"
// for every one of the filters: type, status, and indexUid. // for every one of the filters: type, status, and indexUid.
let type_ = type_.map(CS::into_inner).and_then(fold_star_or); let type_ = type_.map(CS::into_inner).and_then(fold_star_or);
let status = status.map(CS::into_inner).and_then(fold_star_or); let status = status.map(CS::into_inner).and_then(fold_star_or);
@ -128,13 +135,32 @@ async fn get_tasks(
indexes_filters indexes_filters
}; };
let tasks: TaskListView = meilisearch // We +1 just to know if there is more after this "page" or not.
.list_tasks(filters, None, None) let limit = limit.saturating_add(1);
let mut tasks_results: Vec<_> = meilisearch
.list_tasks(filters, Some(limit), from)
.await? .await?
.into_iter() .into_iter()
.map(TaskView::from) .map(TaskView::from)
.collect::<Vec<_>>() .collect();
.into();
// If we fetched as many tasks as the incremented limit (the requested limit + 1),
// it means that there is more to come.
let next = if tasks_results.len() == limit {
tasks_results.pop().map(|t| t.uid)
} else {
None
};
let from = tasks_results.first().map(|t| t.uid);
let tasks = TaskListView {
results: tasks_results,
limit: limit.saturating_sub(1),
from,
next,
};
Ok(HttpResponse::Ok().json(tasks)) Ok(HttpResponse::Ok().json(tasks))
} }

View File

@ -180,7 +180,7 @@ fn serialize_duration<S: Serializer>(
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct TaskView { pub struct TaskView {
uid: TaskId, pub uid: TaskId,
index_uid: Option<String>, index_uid: Option<String>,
status: TaskStatus, status: TaskStatus,
#[serde(rename = "type")] #[serde(rename = "type")]
@ -369,13 +369,10 @@ impl From<Task> for TaskView {
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]
pub struct TaskListView { pub struct TaskListView {
results: Vec<TaskView>, pub results: Vec<TaskView>,
} pub limit: usize,
pub from: Option<TaskId>,
impl From<Vec<TaskView>> for TaskListView { pub next: Option<TaskId>,
fn from(results: Vec<TaskView>) -> Self {
Self { results }
}
} }
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]

View File

@ -68,7 +68,7 @@ async fn import_dump_v2_movie_raw() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
tasks, tasks,
json!({ "results": [{"uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT41.751156S", "enqueuedAt": "2021-09-08T08:30:30.550282Z", "startedAt": "2021-09-08T08:30:30.553012Z", "finishedAt": "2021-09-08T08:31:12.304168Z"}]}) json!({ "results": [{"uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT41.751156S", "enqueuedAt": "2021-09-08T08:30:30.550282Z", "startedAt": "2021-09-08T08:30:30.553012Z", "finishedAt": "2021-09-08T08:31:12.304168Z" }], "limit": 20, "from": 0, "next": null })
); );
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
@ -132,7 +132,7 @@ async fn import_dump_v2_movie_with_settings() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
tasks, tasks,
json!({ "results": [{ "uid": 1, "indexUid": "indexUID", "status": "succeeded", "type": "settingsUpdate", "details": { "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "stopWords": ["of", "the"] }, "duration": "PT37.488777S", "enqueuedAt": "2021-09-08T08:24:02.323444Z", "startedAt": "2021-09-08T08:24:02.324145Z", "finishedAt": "2021-09-08T08:24:39.812922Z" }, { "uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT39.941318S", "enqueuedAt": "2021-09-08T08:21:14.742672Z", "startedAt": "2021-09-08T08:21:14.750166Z", "finishedAt": "2021-09-08T08:21:54.691484Z" }]}) json!({ "results": [{ "uid": 1, "indexUid": "indexUID", "status": "succeeded", "type": "settingsUpdate", "details": { "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "stopWords": ["of", "the"] }, "duration": "PT37.488777S", "enqueuedAt": "2021-09-08T08:24:02.323444Z", "startedAt": "2021-09-08T08:24:02.324145Z", "finishedAt": "2021-09-08T08:24:39.812922Z" }, { "uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT39.941318S", "enqueuedAt": "2021-09-08T08:21:14.742672Z", "startedAt": "2021-09-08T08:21:14.750166Z", "finishedAt": "2021-09-08T08:21:54.691484Z" }], "limit": 20, "from": 1, "next": null })
); );
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
@ -198,10 +198,6 @@ async fn import_dump_v2_rubygems_with_settings() {
tasks["results"][0], tasks["results"][0],
json!({"uid": 92, "indexUid": "rubygems", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": {"receivedDocuments": 0, "indexedDocuments": 1042}, "duration": "PT14.034672S", "enqueuedAt": "2021-09-08T08:40:31.390775Z", "startedAt": "2021-09-08T08:51:39.060642Z", "finishedAt": "2021-09-08T08:51:53.095314Z"}) json!({"uid": 92, "indexUid": "rubygems", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": {"receivedDocuments": 0, "indexedDocuments": 1042}, "duration": "PT14.034672S", "enqueuedAt": "2021-09-08T08:40:31.390775Z", "startedAt": "2021-09-08T08:51:39.060642Z", "finishedAt": "2021-09-08T08:51:53.095314Z"})
); );
assert_eq!(
tasks["results"][92],
json!({"uid": 0, "indexUid": "rubygems", "status": "succeeded", "type": "settingsUpdate", "details": {"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "rankingRules": ["typo", "words", "desc(fame)", "proximity", "attribute", "exactness", "desc(total_downloads)"]}, "duration": "PT0.008886S", "enqueuedAt": "2021-09-08T08:40:28.660188Z", "startedAt": "2021-09-08T08:40:28.660766Z", "finishedAt": "2021-09-08T08:40:28.669652Z"})
);
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
let (document, code) = index.get_document(188040, None).await; let (document, code) = index.get_document(188040, None).await;
@ -264,7 +260,7 @@ async fn import_dump_v3_movie_raw() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
tasks, tasks,
json!({ "results": [{"uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT41.751156S", "enqueuedAt": "2021-09-08T08:30:30.550282Z", "startedAt": "2021-09-08T08:30:30.553012Z", "finishedAt": "2021-09-08T08:31:12.304168Z"}]}) json!({ "results": [{"uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT41.751156S", "enqueuedAt": "2021-09-08T08:30:30.550282Z", "startedAt": "2021-09-08T08:30:30.553012Z", "finishedAt": "2021-09-08T08:31:12.304168Z" }], "limit": 20, "from": 0, "next": null })
); );
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
@ -328,7 +324,7 @@ async fn import_dump_v3_movie_with_settings() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
tasks, tasks,
json!({ "results": [{ "uid": 1, "indexUid": "indexUID", "status": "succeeded", "type": "settingsUpdate", "details": { "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "stopWords": ["of", "the"] }, "duration": "PT37.488777S", "enqueuedAt": "2021-09-08T08:24:02.323444Z", "startedAt": "2021-09-08T08:24:02.324145Z", "finishedAt": "2021-09-08T08:24:39.812922Z" }, { "uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT39.941318S", "enqueuedAt": "2021-09-08T08:21:14.742672Z", "startedAt": "2021-09-08T08:21:14.750166Z", "finishedAt": "2021-09-08T08:21:54.691484Z" }]}) json!({ "results": [{ "uid": 1, "indexUid": "indexUID", "status": "succeeded", "type": "settingsUpdate", "details": { "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "stopWords": ["of", "the"] }, "duration": "PT37.488777S", "enqueuedAt": "2021-09-08T08:24:02.323444Z", "startedAt": "2021-09-08T08:24:02.324145Z", "finishedAt": "2021-09-08T08:24:39.812922Z" }, { "uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT39.941318S", "enqueuedAt": "2021-09-08T08:21:14.742672Z", "startedAt": "2021-09-08T08:21:14.750166Z", "finishedAt": "2021-09-08T08:21:54.691484Z" }], "limit": 20, "from": 1, "next": null })
); );
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
@ -394,10 +390,6 @@ async fn import_dump_v3_rubygems_with_settings() {
tasks["results"][0], tasks["results"][0],
json!({"uid": 92, "indexUid": "rubygems", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": {"receivedDocuments": 0, "indexedDocuments": 1042}, "duration": "PT14.034672S", "enqueuedAt": "2021-09-08T08:40:31.390775Z", "startedAt": "2021-09-08T08:51:39.060642Z", "finishedAt": "2021-09-08T08:51:53.095314Z"}) json!({"uid": 92, "indexUid": "rubygems", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": {"receivedDocuments": 0, "indexedDocuments": 1042}, "duration": "PT14.034672S", "enqueuedAt": "2021-09-08T08:40:31.390775Z", "startedAt": "2021-09-08T08:51:39.060642Z", "finishedAt": "2021-09-08T08:51:53.095314Z"})
); );
assert_eq!(
tasks["results"][92],
json!({"uid": 0, "indexUid": "rubygems", "status": "succeeded", "type": "settingsUpdate", "details": {"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "rankingRules": ["typo", "words", "desc(fame)", "proximity", "attribute", "exactness", "desc(total_downloads)"]}, "duration": "PT0.008886S", "enqueuedAt": "2021-09-08T08:40:28.660188Z", "startedAt": "2021-09-08T08:40:28.660766Z", "finishedAt": "2021-09-08T08:40:28.669652Z"})
);
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
let (document, code) = index.get_document(188040, None).await; let (document, code) = index.get_document(188040, None).await;
@ -460,7 +452,7 @@ async fn import_dump_v4_movie_raw() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
tasks, tasks,
json!({ "results": [{"uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT41.751156S", "enqueuedAt": "2021-09-08T08:30:30.550282Z", "startedAt": "2021-09-08T08:30:30.553012Z", "finishedAt": "2021-09-08T08:31:12.304168Z"}]}) json!({ "results": [{"uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT41.751156S", "enqueuedAt": "2021-09-08T08:30:30.550282Z", "startedAt": "2021-09-08T08:30:30.553012Z", "finishedAt": "2021-09-08T08:31:12.304168Z" }], "limit" : 20, "from": 0, "next": null })
); );
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
@ -524,7 +516,7 @@ async fn import_dump_v4_movie_with_settings() {
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
tasks, tasks,
json!({ "results": [{ "uid": 1, "indexUid": "indexUID", "status": "succeeded", "type": "settingsUpdate", "details": { "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "stopWords": ["of", "the"] }, "duration": "PT37.488777S", "enqueuedAt": "2021-09-08T08:24:02.323444Z", "startedAt": "2021-09-08T08:24:02.324145Z", "finishedAt": "2021-09-08T08:24:39.812922Z" }, { "uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT39.941318S", "enqueuedAt": "2021-09-08T08:21:14.742672Z", "startedAt": "2021-09-08T08:21:14.750166Z", "finishedAt": "2021-09-08T08:21:54.691484Z" }]}) json!({ "results": [{ "uid": 1, "indexUid": "indexUID", "status": "succeeded", "type": "settingsUpdate", "details": { "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "stopWords": ["of", "the"] }, "duration": "PT37.488777S", "enqueuedAt": "2021-09-08T08:24:02.323444Z", "startedAt": "2021-09-08T08:24:02.324145Z", "finishedAt": "2021-09-08T08:24:39.812922Z" }, { "uid": 0, "indexUid": "indexUID", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": { "receivedDocuments": 0, "indexedDocuments": 31944 }, "duration": "PT39.941318S", "enqueuedAt": "2021-09-08T08:21:14.742672Z", "startedAt": "2021-09-08T08:21:14.750166Z", "finishedAt": "2021-09-08T08:21:54.691484Z" }], "limit": 20, "from": 1, "next": null })
); );
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
@ -590,10 +582,6 @@ async fn import_dump_v4_rubygems_with_settings() {
tasks["results"][0], tasks["results"][0],
json!({ "uid": 92, "indexUid": "rubygems", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": {"receivedDocuments": 0, "indexedDocuments": 1042}, "duration": "PT14.034672S", "enqueuedAt": "2021-09-08T08:40:31.390775Z", "startedAt": "2021-09-08T08:51:39.060642Z", "finishedAt": "2021-09-08T08:51:53.095314Z"}) json!({ "uid": 92, "indexUid": "rubygems", "status": "succeeded", "type": "documentAdditionOrUpdate", "details": {"receivedDocuments": 0, "indexedDocuments": 1042}, "duration": "PT14.034672S", "enqueuedAt": "2021-09-08T08:40:31.390775Z", "startedAt": "2021-09-08T08:51:39.060642Z", "finishedAt": "2021-09-08T08:51:53.095314Z"})
); );
assert_eq!(
tasks["results"][92],
json!({ "uid": 0, "indexUid": "rubygems", "status": "succeeded", "type": "settingsUpdate", "details": {"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "rankingRules": ["typo", "words", "desc(fame)", "proximity", "attribute", "exactness", "desc(total_downloads)"]}, "duration": "PT0.008886S", "enqueuedAt": "2021-09-08T08:40:28.660188Z", "startedAt": "2021-09-08T08:40:28.660766Z", "finishedAt": "2021-09-08T08:40:28.669652Z"})
);
// finally we're just going to check that we can still get a few documents by id // finally we're just going to check that we can still get a few documents by id
let (document, code) = index.get_document(188040, None).await; let (document, code) = index.get_document(188040, None).await;

View File

@ -41,6 +41,7 @@ rand = "0.8.5"
rayon = "1.5.1" rayon = "1.5.1"
regex = "1.5.5" regex = "1.5.5"
reqwest = { version = "0.11.9", features = ["json", "rustls-tls"], default-features = false, optional = true } reqwest = { version = "0.11.9", features = ["json", "rustls-tls"], default-features = false, optional = true }
roaring = "0.9.0"
rustls = "0.20.4" rustls = "0.20.4"
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
serde_json = { version = "1.0.79", features = ["preserve_order"] } serde_json = { version = "1.0.79", features = ["preserve_order"] }

View File

@ -4,7 +4,7 @@ use crate::snapshot::SnapshotJob;
use super::task::{Task, TaskEvent}; use super::task::{Task, TaskEvent};
pub type BatchId = u64; pub type BatchId = u32;
#[derive(Debug)] #[derive(Debug)]
pub enum BatchContent { pub enum BatchContent {

View File

@ -342,18 +342,10 @@ impl Scheduler {
} }
async fn fetch_pending_tasks(&mut self) -> Result<()> { async fn fetch_pending_tasks(&mut self) -> Result<()> {
// We must NEVER re-enqueue an already processed task! Its content uuid would point to a nonexistent file.
//
// TODO(marin): This may create some latency when the first batch lazy loads the pending updates.
let mut filter = TaskFilter::default();
filter.filter_fn(|task| !task.is_finished());
self.store self.store
.list_tasks(Some(self.next_fetched_task_id), Some(filter), None) .fetch_unfinished_tasks(Some(self.next_fetched_task_id))
.await? .await?
.into_iter() .into_iter()
// The tasks arrive in reverse order, and we need to insert them in order.
.rev()
.for_each(|t| { .for_each(|t| {
self.next_fetched_task_id = t.id + 1; self.next_fetched_task_id = t.id + 1;
self.register_task(t); self.register_task(t);

View File

@ -10,7 +10,7 @@ use crate::{
index_resolver::IndexUid, index_resolver::IndexUid,
}; };
pub type TaskId = u64; pub type TaskId = u32;
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))] #[cfg_attr(test, derive(proptest_derive::Arbitrary))]

View File

@ -41,6 +41,10 @@ impl TaskFilter {
} }
} }
/// Returns the set of index uids this filter is restricted to, or `None`
/// when the filter carries no index restriction.
fn filtered_indexes(&self) -> Option<&HashSet<String>> {
self.indexes.as_ref()
}
/// Adds an index to the filter, so the filter must match this index. /// Adds an index to the filter, so the filter must match this index.
pub fn filter_index(&mut self, index: String) { pub fn filter_index(&mut self, index: String) {
self.indexes self.indexes
@ -186,6 +190,17 @@ impl TaskStore {
Ok(tasks) Ok(tasks)
} }
/// Fetches the unfinished tasks from the underlying store, starting at `offset`.
///
/// The store lookup is synchronous, so it is run through
/// `tokio::task::spawn_blocking` inside its own read transaction.
///
/// # Errors
///
/// Returns an error if the blocking task cannot be joined, if the read
/// transaction cannot be opened, or if the store lookup itself fails.
pub async fn fetch_unfinished_tasks(&self, offset: Option<TaskId>) -> Result<Vec<Task>> {
// Clone the store handle so it can be moved into the blocking closure.
let store = self.store.clone();
tokio::task::spawn_blocking(move || {
let txn = store.rtxn()?;
let tasks = store.fetch_unfinished_tasks(&txn, offset)?;
Ok(tasks)
})
// `?` propagates a failure to join the blocking task; the closure's own
// `Result` is then returned unchanged.
.await?
}
pub async fn list_tasks( pub async fn list_tasks(
&self, &self,
offset: Option<TaskId>, offset: Option<TaskId>,
@ -325,6 +340,13 @@ pub mod test {
} }
} }
/// Test double for `fetch_unfinished_tasks`: forwards to the real store, or
/// to the mock registered under the name `"fetch_unfinished_tasks"`.
pub async fn fetch_unfinished_tasks(&self, from: Option<TaskId>) -> Result<Vec<Task>> {
match self {
Self::Real(s) => s.fetch_unfinished_tasks(from).await,
// NOTE(review): no safety invariant is stated for this `unsafe` call.
// Presumably the registered mock must accept `Option<TaskId>` and return
// `Result<Vec<Task>>` — confirm against the mocking library's contract.
Self::Mock(m) => unsafe { m.get("fetch_unfinished_tasks").call(from) },
}
}
pub async fn list_tasks( pub async fn list_tasks(
&self, &self,
from: Option<TaskId>, from: Option<TaskId>,
@ -378,7 +400,7 @@ pub mod test {
let mut runner = TestRunner::new(Config::default()); let mut runner = TestRunner::new(Config::default());
runner runner
.run(&(0..100u64).prop_map(gen_task), |task| { .run(&(0..100u32).prop_map(gen_task), |task| {
let mut txn = store.wtxn().unwrap(); let mut txn = store.wtxn().unwrap();
let previous_id = store.next_task_id(&mut txn).unwrap(); let previous_id = store.next_task_id(&mut txn).unwrap();

View File

@ -1,62 +1,30 @@
#[allow(clippy::upper_case_acronyms)] #[allow(clippy::upper_case_acronyms)]
type BEU64 = milli::heed::zerocopy::U64<milli::heed::byteorder::BE>;
const UID_TASK_IDS: &str = "uid_task_id"; type BEU32 = milli::heed::zerocopy::U32<milli::heed::byteorder::BE>;
const INDEX_UIDS_TASK_IDS: &str = "index-uids-task-ids";
const TASKS: &str = "tasks"; const TASKS: &str = "tasks";
use std::borrow::Cow; use std::collections::HashSet;
use std::collections::BinaryHeap; use std::ops::Bound::{Excluded, Unbounded};
use std::convert::TryInto;
use std::mem::size_of;
use std::ops::Range;
use std::result::Result as StdResult; use std::result::Result as StdResult;
use std::sync::Arc; use std::sync::Arc;
use milli::heed::types::{ByteSlice, OwnedType, SerdeJson, Unit}; use milli::heed::types::{OwnedType, SerdeJson, Str};
use milli::heed::{BytesDecode, BytesEncode, Database, Env, RoTxn, RwTxn}; use milli::heed::{Database, Env, RoTxn, RwTxn};
use milli::heed_codec::RoaringBitmapCodec;
use roaring::RoaringBitmap;
use crate::tasks::task::{Task, TaskId}; use crate::tasks::task::{Task, TaskId};
use super::super::Result; use super::super::Result;
use super::TaskFilter; use super::TaskFilter;
enum IndexUidTaskIdCodec {}
impl<'a> BytesEncode<'a> for IndexUidTaskIdCodec {
type EItem = (&'a str, TaskId);
fn bytes_encode((s, id): &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let size = s.len() + std::mem::size_of::<TaskId>() + 1;
if size > 512 {
return None;
}
let mut b = Vec::with_capacity(size);
b.extend_from_slice(s.as_bytes());
// null terminate the string
b.push(0);
b.extend_from_slice(&id.to_be_bytes());
Some(Cow::Owned(b))
}
}
impl<'a> BytesDecode<'a> for IndexUidTaskIdCodec {
type DItem = (&'a str, TaskId);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let len = bytes.len();
let s_end = len.checked_sub(size_of::<TaskId>())?.checked_sub(1)?;
let str_bytes = &bytes[..s_end];
let str = std::str::from_utf8(str_bytes).ok()?;
let id = TaskId::from_be_bytes(bytes[(len - size_of::<TaskId>())..].try_into().ok()?);
Some((str, id))
}
}
pub struct Store { pub struct Store {
env: Arc<Env>, env: Arc<Env>,
uids_task_ids: Database<IndexUidTaskIdCodec, Unit>, /// Maps an index uid to the set of tasks ids associated to it.
tasks: Database<OwnedType<BEU64>, SerdeJson<Task>>, index_uid_task_ids: Database<Str, RoaringBitmapCodec>,
tasks: Database<OwnedType<BEU32>, SerdeJson<Task>>,
} }
impl Drop for Store { impl Drop for Store {
@ -74,12 +42,12 @@ impl Store {
/// You want to patch all un-finished tasks and put them in your pending /// You want to patch all un-finished tasks and put them in your pending
/// queue with the `reset_and_return_unfinished_update` method. /// queue with the `reset_and_return_unfinished_update` method.
pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> { pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> {
let uids_task_ids = env.create_database(Some(UID_TASK_IDS))?; let index_uid_task_ids = env.create_database(Some(INDEX_UIDS_TASK_IDS))?;
let tasks = env.create_database(Some(TASKS))?; let tasks = env.create_database(Some(TASKS))?;
Ok(Self { Ok(Self {
env, env,
uids_task_ids, index_uid_task_ids,
tasks, tasks,
}) })
} }
@ -107,121 +75,104 @@ impl Store {
} }
pub fn put(&self, txn: &mut RwTxn, task: &Task) -> Result<()> { pub fn put(&self, txn: &mut RwTxn, task: &Task) -> Result<()> {
self.tasks.put(txn, &BEU64::new(task.id), task)?; self.tasks.put(txn, &BEU32::new(task.id), task)?;
// only add the task to the indexes index if it has an index_uid // only add the task to the indexes index if it has an index_uid
if let Some(ref index_uid) = task.index_uid { if let Some(index_uid) = &task.index_uid {
self.uids_task_ids.put(txn, &(index_uid, task.id), &())?; let mut tasks_set = self
.index_uid_task_ids
.get(txn, index_uid)?
.unwrap_or_default();
tasks_set.insert(task.id);
self.index_uid_task_ids.put(txn, index_uid, &tasks_set)?;
} }
Ok(()) Ok(())
} }
pub fn get(&self, txn: &RoTxn, id: TaskId) -> Result<Option<Task>> { pub fn get(&self, txn: &RoTxn, id: TaskId) -> Result<Option<Task>> {
let task = self.tasks.get(txn, &BEU64::new(id))?; let task = self.tasks.get(txn, &BEU32::new(id))?;
Ok(task) Ok(task)
} }
pub fn list_tasks<'a>( /// Returns the unfinished tasks starting from the given taskId in ascending order.
pub fn fetch_unfinished_tasks(&self, txn: &RoTxn, from: Option<TaskId>) -> Result<Vec<Task>> {
// We must NEVER re-enqueue an already processed task! Its content uuid would point to a nonexistent file.
//
// TODO(marin): This may create some latency when the first batch lazy loads the pending updates.
// Without an explicit `from`, start at the default task id (0), i.e. scan every stored task.
let from = from.unwrap_or_default();
let result: StdResult<Vec<_>, milli::heed::Error> = self
.tasks
// Keys are big-endian task ids; the range is open-ended, starting at `from`.
.range(txn, &(BEU32::new(from)..))?
// Drop the key and keep only the deserialized task.
.map(|r| r.map(|(_, t)| t))
// Errors are kept so that `collect` reports them; only finished tasks are dropped.
.filter(|result| result.as_ref().map_or(true, |t| !t.is_finished()))
.collect();
result.map_err(Into::into)
}
/// Returns all the tasks starting from the given taskId and going in descending order.
pub fn list_tasks(
&self, &self,
txn: &'a RoTxn, txn: &RoTxn,
from: Option<TaskId>, from: Option<TaskId>,
filter: Option<TaskFilter>, filter: Option<TaskFilter>,
limit: Option<usize>, limit: Option<usize>,
) -> Result<Vec<Task>> { ) -> Result<Vec<Task>> {
let from = from.unwrap_or_default(); let from = match from {
let range = from..limit Some(from) => from,
.map(|limit| (limit as u64).saturating_add(from)) None => self.tasks.last(txn)?.map_or(0, |(id, _)| id.get()),
.unwrap_or(u64::MAX);
let iter: Box<dyn Iterator<Item = StdResult<_, milli::heed::Error>>> = match filter {
Some(
ref filter @ TaskFilter {
indexes: Some(_), ..
},
) => {
let iter = self
.compute_candidates(txn, filter, range)?
.into_iter()
.filter_map(|id| self.tasks.get(txn, &BEU64::new(id)).transpose());
Box::new(iter)
}
_ => Box::new(
self.tasks
.rev_range(txn, &(BEU64::new(range.start)..BEU64::new(range.end)))?
.map(|r| r.map(|(_, t)| t)),
),
}; };
let apply_fitler = |task: &StdResult<_, milli::heed::Error>| match task { let filter_fn = |task: &Task| {
Ok(ref t) => filter filter
.as_ref() .as_ref()
.and_then(|filter| filter.filter_fn.as_ref()) .and_then(|f| f.filter_fn.as_ref())
.map(|f| f(t)) .map_or(true, |f| f(task))
.unwrap_or(true),
Err(_) => true,
}; };
// Collect 'limit' task if it exists or all of them.
let tasks = iter let result: Result<Vec<_>> = match filter.as_ref().and_then(|f| f.filtered_indexes()) {
.filter(apply_fitler) Some(indexes) => self
.compute_candidates(txn, indexes, from)?
.filter(|result| result.as_ref().map_or(true, filter_fn))
.take(limit.unwrap_or(usize::MAX)) .take(limit.unwrap_or(usize::MAX))
.try_fold::<_, _, StdResult<_, milli::heed::Error>>(Vec::new(), |mut v, task| { .collect(),
v.push(task?); None => self
Ok(v) .tasks
})?; .rev_range(txn, &(..=BEU32::new(from)))?
.map(|r| r.map(|(_, t)| t).map_err(Into::into))
.filter(|result| result.as_ref().map_or(true, filter_fn))
.take(limit.unwrap_or(usize::MAX))
.collect(),
};
Ok(tasks) result.map_err(Into::into)
} }
fn compute_candidates( fn compute_candidates<'a>(
&self, &'a self,
txn: &milli::heed::RoTxn, txn: &'a RoTxn,
filter: &TaskFilter, indexes: &HashSet<String>,
range: Range<TaskId>, from: TaskId,
) -> Result<BinaryHeap<TaskId>> { ) -> Result<impl Iterator<Item = Result<Task>> + 'a> {
let mut candidates = BinaryHeap::new(); let mut candidates = RoaringBitmap::new();
if let Some(ref indexes) = filter.indexes {
for index in indexes {
// We need to prefix search the null terminated string to make sure that we only
// get exact matches for the index, and not other uids that would share the same
// prefix, i.e test and test1.
let mut index_uid = index.as_bytes().to_vec();
index_uid.push(0);
self.uids_task_ids for index_uid in indexes {
.remap_key_type::<ByteSlice>() if let Some(tasks_set) = self.index_uid_task_ids.get(txn, index_uid)? {
.rev_prefix_iter(txn, &index_uid)? candidates |= tasks_set;
.map(|entry| -> StdResult<_, milli::heed::Error> {
let (key, _) = entry?;
let (_, id) = IndexUidTaskIdCodec::bytes_decode(key)
.ok_or(milli::heed::Error::Decoding)?;
Ok(id)
})
.skip_while(|entry| {
entry
.as_ref()
.ok()
// we skip all elements till we enter in the range
.map(|key| !range.contains(key))
// if we encounter an error we returns true to collect it later
.unwrap_or(true)
})
.take_while(|entry| {
entry
.as_ref()
.ok()
// as soon as we are out of the range we exit
.map(|key| range.contains(key))
// if we encounter an error we returns true to collect it later
.unwrap_or(true)
})
.try_for_each::<_, StdResult<(), milli::heed::Error>>(|id| {
candidates.push(id?);
Ok(())
})?;
} }
} }
Ok(candidates) candidates.remove_range((Excluded(from), Unbounded));
let iter = candidates
.into_iter()
.rev()
.filter_map(|id| self.get(txn, id).transpose());
Ok(iter)
} }
} }
@ -230,8 +181,6 @@ pub mod test {
use itertools::Itertools; use itertools::Itertools;
use milli::heed::EnvOpenOptions; use milli::heed::EnvOpenOptions;
use nelson::Mocker; use nelson::Mocker;
use proptest::collection::vec;
use proptest::prelude::*;
use tempfile::TempDir; use tempfile::TempDir;
use crate::index_resolver::IndexUid; use crate::index_resolver::IndexUid;
@ -305,9 +254,20 @@ pub mod test {
} }
} }
pub fn list_tasks<'a>( pub fn fetch_unfinished_tasks(
&self, &self,
txn: &'a RoTxn, txn: &RoTxn,
from: Option<TaskId>,
) -> Result<Vec<Task>> {
match self {
MockStore::Real(index) => index.fetch_unfinished_tasks(txn, from),
MockStore::Fake(_) => todo!(),
}
}
pub fn list_tasks(
&self,
txn: &RoTxn,
from: Option<TaskId>, from: Option<TaskId>,
filter: Option<TaskFilter>, filter: Option<TaskFilter>,
limit: Option<usize>, limit: Option<usize>,
@ -429,26 +389,4 @@ pub mod test {
"test" "test"
); );
} }
proptest! {
#[test]
fn encode_decode_roundtrip(index_uid in any::<IndexUid>(), task_id in 0..TaskId::MAX) {
let value = (index_uid.as_ref(), task_id);
let bytes = IndexUidTaskIdCodec::bytes_encode(&value).unwrap();
let (index, id) = IndexUidTaskIdCodec::bytes_decode(bytes.as_ref()).unwrap();
assert_eq!(&*index_uid, index);
assert_eq!(task_id, id);
}
#[test]
fn encode_doesnt_crash(index_uid in "\\PC*", task_id in 0..TaskId::MAX) {
let value = (index_uid.as_ref(), task_id);
IndexUidTaskIdCodec::bytes_encode(&value);
}
#[test]
fn decode_doesnt_crash(bytes in vec(any::<u8>(), 0..1000)) {
IndexUidTaskIdCodec::bytes_decode(&bytes);
}
}
} }