mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Merge #4122
4122: Bring back changes from `release-v1.4.1` into `main` r=Kerollmops a=curquiza Co-authored-by: curquiza <curquiza@users.noreply.github.com> Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com> Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Vivek Kumar <vivek.26@outlook.com> Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
commit
0913373a5e
28
Cargo.lock
generated
28
Cargo.lock
generated
@ -468,7 +468,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "benchmarks"
|
name = "benchmarks"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bytes",
|
"bytes",
|
||||||
@ -1206,7 +1206,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dump"
|
name = "dump"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"big_s",
|
"big_s",
|
||||||
@ -1417,7 +1417,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "file-store"
|
name = "file-store"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"faux",
|
"faux",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
@ -1439,7 +1439,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "filter-parser"
|
name = "filter-parser"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"insta",
|
"insta",
|
||||||
"nom",
|
"nom",
|
||||||
@ -1459,7 +1459,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flatten-serde-json"
|
name = "flatten-serde-json"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"criterion",
|
"criterion",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@ -1577,7 +1577,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fuzzers"
|
name = "fuzzers"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arbitrary",
|
"arbitrary",
|
||||||
"clap",
|
"clap",
|
||||||
@ -1891,7 +1891,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "index-scheduler"
|
name = "index-scheduler"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"big_s",
|
"big_s",
|
||||||
@ -2088,7 +2088,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "json-depth-checker"
|
name = "json-depth-checker"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"criterion",
|
"criterion",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@ -2500,7 +2500,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meili-snap"
|
name = "meili-snap"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"insta",
|
"insta",
|
||||||
"md5",
|
"md5",
|
||||||
@ -2509,7 +2509,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilisearch"
|
name = "meilisearch"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-cors",
|
"actix-cors",
|
||||||
"actix-http",
|
"actix-http",
|
||||||
@ -2600,7 +2600,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilisearch-auth"
|
name = "meilisearch-auth"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.21.2",
|
"base64 0.21.2",
|
||||||
"enum-iterator",
|
"enum-iterator",
|
||||||
@ -2619,7 +2619,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilisearch-types"
|
name = "meilisearch-types"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-web",
|
"actix-web",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
@ -2673,7 +2673,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "milli"
|
name = "milli"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"big_s",
|
"big_s",
|
||||||
"bimap",
|
"bimap",
|
||||||
@ -2995,7 +2995,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "permissive-json-pointer"
|
name = "permissive-json-pointer"
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"big_s",
|
"big_s",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
@ -18,7 +18,7 @@ members = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "1.4.0"
|
version = "1.4.1"
|
||||||
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
|
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
|
||||||
description = "Meilisearch HTTP server"
|
description = "Meilisearch HTTP server"
|
||||||
homepage = "https://meilisearch.com"
|
homepage = "https://meilisearch.com"
|
||||||
|
63
meilisearch/tests/search/distinct.rs
Normal file
63
meilisearch/tests/search/distinct.rs
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
use meili_snap::snapshot;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
use crate::common::{Server, Value};
|
||||||
|
use crate::json;
|
||||||
|
|
||||||
|
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||||
|
json!([
|
||||||
|
{"productId": 1, "shopId": 1},
|
||||||
|
{"productId": 2, "shopId": 1},
|
||||||
|
{"productId": 3, "shopId": 2},
|
||||||
|
{"productId": 4, "shopId": 2},
|
||||||
|
{"productId": 5, "shopId": 3},
|
||||||
|
{"productId": 6, "shopId": 3},
|
||||||
|
{"productId": 7, "shopId": 4},
|
||||||
|
{"productId": 8, "shopId": 4},
|
||||||
|
{"productId": 9, "shopId": 5},
|
||||||
|
{"productId": 10, "shopId": 5}
|
||||||
|
])
|
||||||
|
});
|
||||||
|
|
||||||
|
/// Field of the fixture documents passed as the primary key when they are added to the index.
static DOCUMENT_PRIMARY_KEY: &str = "productId";
/// Field of the fixture documents configured as the index's distinct attribute.
static DOCUMENT_DISTINCT_KEY: &str = "shopId";
|
||||||
|
|
||||||
|
/// testing: https://github.com/meilisearch/meilisearch/issues/4078
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn distinct_search_with_offset_no_ranking() {
|
||||||
|
let server = Server::new().await;
|
||||||
|
let index = server.index("test");
|
||||||
|
|
||||||
|
let documents = DOCUMENTS.clone();
|
||||||
|
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
|
||||||
|
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
|
||||||
|
index.wait_task(1).await;
|
||||||
|
|
||||||
|
fn get_hits(Value(response): Value) -> Vec<i64> {
|
||||||
|
let hits_array = response["hits"].as_array().unwrap();
|
||||||
|
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
|
||||||
|
}
|
||||||
|
|
||||||
|
let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
|
||||||
|
let hits = get_hits(response);
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(hits.len(), @"2");
|
||||||
|
snapshot!(format!("{:?}", hits), @"[1, 2]");
|
||||||
|
|
||||||
|
let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
|
||||||
|
let hits = get_hits(response);
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(hits.len(), @"2");
|
||||||
|
snapshot!(format!("{:?}", hits), @"[3, 4]");
|
||||||
|
|
||||||
|
let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
|
||||||
|
let hits = get_hits(response);
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(hits.len(), @"1");
|
||||||
|
snapshot!(format!("{:?}", hits), @"[5]");
|
||||||
|
|
||||||
|
let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
|
||||||
|
let hits = get_hits(response);
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(hits.len(), @"0");
|
||||||
|
}
|
@ -1,6 +1,7 @@
|
|||||||
// This module contains all the tests concerning search. Each particular feature of the search
|
// This module contains all the tests concerning search. Each particular feature of the search
|
||||||
// should be tested in its own module to isolate tests and keep the tests readable.
|
// should be tested in its own module to isolate tests and keep the tests readable.
|
||||||
|
|
||||||
|
mod distinct;
|
||||||
mod errors;
|
mod errors;
|
||||||
mod facet_search;
|
mod facet_search;
|
||||||
mod formatted;
|
mod formatted;
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
use std::{io, str};
|
use std::{io, str};
|
||||||
|
|
||||||
use obkv::KvReader;
|
use obkv::KvReader;
|
||||||
@ -19,14 +20,14 @@ use crate::FieldId;
|
|||||||
pub struct EnrichedDocumentsBatchReader<R> {
|
pub struct EnrichedDocumentsBatchReader<R> {
|
||||||
documents: DocumentsBatchReader<R>,
|
documents: DocumentsBatchReader<R>,
|
||||||
primary_key: String,
|
primary_key: String,
|
||||||
external_ids: grenad::ReaderCursor<File>,
|
external_ids: grenad::ReaderCursor<BufReader<File>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
|
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
documents: DocumentsBatchReader<R>,
|
documents: DocumentsBatchReader<R>,
|
||||||
primary_key: String,
|
primary_key: String,
|
||||||
external_ids: grenad::Reader<File>,
|
external_ids: grenad::Reader<BufReader<File>>,
|
||||||
) -> Result<Self, Error> {
|
) -> Result<Self, Error> {
|
||||||
if documents.documents_count() as u64 == external_ids.len() {
|
if documents.documents_count() as u64 == external_ids.len() {
|
||||||
Ok(EnrichedDocumentsBatchReader {
|
Ok(EnrichedDocumentsBatchReader {
|
||||||
@ -75,7 +76,7 @@ pub struct EnrichedDocument<'a> {
|
|||||||
pub struct EnrichedDocumentsBatchCursor<R> {
|
pub struct EnrichedDocumentsBatchCursor<R> {
|
||||||
documents: DocumentsBatchCursor<R>,
|
documents: DocumentsBatchCursor<R>,
|
||||||
primary_key: String,
|
primary_key: String,
|
||||||
external_ids: grenad::ReaderCursor<File>,
|
external_ids: grenad::ReaderCursor<BufReader<File>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R> EnrichedDocumentsBatchCursor<R> {
|
impl<R> EnrichedDocumentsBatchCursor<R> {
|
||||||
|
@ -46,18 +46,27 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|||||||
if let Some(distinct_fid) = distinct_fid {
|
if let Some(distinct_fid) = distinct_fid {
|
||||||
let mut excluded = RoaringBitmap::new();
|
let mut excluded = RoaringBitmap::new();
|
||||||
let mut results = vec![];
|
let mut results = vec![];
|
||||||
|
let mut skip = 0;
|
||||||
for docid in universe.iter() {
|
for docid in universe.iter() {
|
||||||
if results.len() >= from + length {
|
if results.len() >= length {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if excluded.contains(docid) {
|
if excluded.contains(docid) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
|
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
|
||||||
|
skip += 1;
|
||||||
|
if skip <= from {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
results.push(docid);
|
results.push(docid);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut all_candidates = universe - excluded;
|
let mut all_candidates = universe - excluded;
|
||||||
all_candidates.extend(results.iter().copied());
|
all_candidates.extend(results.iter().copied());
|
||||||
|
|
||||||
return Ok(BucketSortOutput {
|
return Ok(BucketSortOutput {
|
||||||
scores: vec![Default::default(); results.len()],
|
scores: vec![Default::default(); results.len()],
|
||||||
docids: results,
|
docids: results,
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
use grenad::CompressionType;
|
use grenad::CompressionType;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
@ -30,7 +31,7 @@ pub struct FacetsUpdateBulk<'i> {
|
|||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
field_ids: Vec<FieldId>,
|
field_ids: Vec<FieldId>,
|
||||||
// None if level 0 does not need to be updated
|
// None if level 0 does not need to be updated
|
||||||
new_data: Option<grenad::Reader<File>>,
|
new_data: Option<grenad::Reader<BufReader<File>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'i> FacetsUpdateBulk<'i> {
|
impl<'i> FacetsUpdateBulk<'i> {
|
||||||
@ -38,7 +39,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
field_ids: Vec<FieldId>,
|
field_ids: Vec<FieldId>,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<File>,
|
new_data: grenad::Reader<BufReader<File>>,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
) -> FacetsUpdateBulk<'i> {
|
) -> FacetsUpdateBulk<'i> {
|
||||||
@ -187,7 +188,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
|||||||
&self,
|
&self,
|
||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
txn: &RoTxn,
|
txn: &RoTxn,
|
||||||
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
|
) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
|
||||||
let mut all_docids = RoaringBitmap::new();
|
let mut all_docids = RoaringBitmap::new();
|
||||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
|
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
|
||||||
for bitmap in bitmaps {
|
for bitmap in bitmaps {
|
||||||
@ -259,7 +260,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
|||||||
field_id: u16,
|
field_id: u16,
|
||||||
level: u8,
|
level: u8,
|
||||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||||
) -> Result<Vec<grenad::Reader<File>>> {
|
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||||
if level == 0 {
|
if level == 0 {
|
||||||
self.read_level_0(rtxn, field_id, handle_group)?;
|
self.read_level_0(rtxn, field_id, handle_group)?;
|
||||||
// Level 0 is already in the database
|
// Level 0 is already in the database
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
use heed::types::{ByteSlice, DecodeIgnore};
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||||
@ -34,14 +35,14 @@ pub struct FacetsUpdateIncremental<'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
inner: FacetsUpdateIncrementalInner,
|
inner: FacetsUpdateIncrementalInner,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<File>,
|
new_data: grenad::Reader<BufReader<File>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'i> FacetsUpdateIncremental<'i> {
|
impl<'i> FacetsUpdateIncremental<'i> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<File>,
|
new_data: grenad::Reader<BufReader<File>>,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
max_group_size: u8,
|
max_group_size: u8,
|
||||||
|
@ -78,6 +78,7 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
|||||||
|
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
use std::iter::FromIterator;
|
use std::iter::FromIterator;
|
||||||
|
|
||||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||||
@ -108,13 +109,17 @@ pub struct FacetsUpdate<'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<File>,
|
new_data: grenad::Reader<BufReader<File>>,
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
max_group_size: u8,
|
max_group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
}
|
}
|
||||||
impl<'i> FacetsUpdate<'i> {
|
impl<'i> FacetsUpdate<'i> {
|
||||||
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
pub fn new(
|
||||||
|
index: &'i Index,
|
||||||
|
facet_type: FacetType,
|
||||||
|
new_data: grenad::Reader<BufReader<File>>,
|
||||||
|
) -> Self {
|
||||||
let database = match facet_type {
|
let database = match facet_type {
|
||||||
FacetType::String => index
|
FacetType::String => index
|
||||||
.facet_id_string_docids
|
.facet_id_string_docids
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use std::io::{Read, Seek};
|
use std::io::{BufWriter, Read, Seek};
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
use std::{fmt, iter};
|
use std::{fmt, iter};
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
|||||||
|
|
||||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||||
|
|
||||||
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
|
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
|
||||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||||
|
|
||||||
// The primary key *field id* that has already been set for this index or the one
|
// The primary key *field id* that has already been set for this index or the one
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
use std::{io, mem, str};
|
use std::{io, mem, str};
|
||||||
|
|
||||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||||
@ -31,7 +32,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
allowed_separators: Option<&[&str]>,
|
allowed_separators: Option<&[&str]>,
|
||||||
dictionary: Option<&[&str]>,
|
dictionary: Option<&[&str]>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
|
) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use heed::{BytesDecode, BytesEncode};
|
use heed::{BytesDecode, BytesEncode};
|
||||||
|
|
||||||
@ -19,7 +19,7 @@ use crate::Result;
|
|||||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||||
docid_fid_facet_number: grenad::Reader<R>,
|
docid_fid_facet_number: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
|
|
||||||
@ -17,7 +17,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
|||||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||||
docid_fid_facet_string: grenad::Reader<R>,
|
docid_fid_facet_string: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::collections::{BTreeMap, HashSet};
|
use std::collections::{BTreeMap, HashSet};
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
|
|
||||||
use heed::zerocopy::AsBytes;
|
use heed::zerocopy::AsBytes;
|
||||||
@ -17,11 +17,11 @@ use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET
|
|||||||
|
|
||||||
/// The extracted facet values stored in grenad files by type.
|
/// The extracted facet values stored in grenad files by type.
|
||||||
pub struct ExtractedFacetValues {
|
pub struct ExtractedFacetValues {
|
||||||
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
|
pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
|
pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
|
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
|
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||||
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
|
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts the facet values of each faceted field of each document.
|
/// Extracts the facet values of each faceted field of each document.
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use grenad::Sorter;
|
use grenad::Sorter;
|
||||||
|
|
||||||
@ -21,7 +21,7 @@ use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
|
|||||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use concat_arrays::concat_arrays;
|
use concat_arrays::concat_arrays;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
@ -18,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
(lat_fid, lng_fid): (FieldId, FieldId),
|
(lat_fid, lng_fid): (FieldId, FieldId),
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use bytemuck::cast_slice;
|
use bytemuck::cast_slice;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
@ -18,7 +18,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
vectors_fid: FieldId,
|
vectors_fid: FieldId,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
use std::iter::FromIterator;
|
use std::iter::FromIterator;
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -26,7 +26,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
exact_attributes: &HashSet<FieldId>,
|
exact_attributes: &HashSet<FieldId>,
|
||||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||||
@ -14,7 +14,7 @@ use crate::{relative_from_absolute_position, DocumentId, Result};
|
|||||||
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::{BinaryHeap, HashMap};
|
use std::collections::{BinaryHeap, HashMap};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
use std::{cmp, io, mem, str, vec};
|
use std::{cmp, io, mem, str, vec};
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
@ -20,7 +21,7 @@ use crate::{DocumentId, Result};
|
|||||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||||
@ -17,7 +17,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu
|
|||||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -12,6 +12,7 @@ mod extract_word_position_docids;
|
|||||||
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
use crossbeam_channel::Sender;
|
use crossbeam_channel::Sender;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
@ -39,8 +40,8 @@ use crate::{FieldId, Result};
|
|||||||
/// Send data in grenad file over provided Sender.
|
/// Send data in grenad file over provided Sender.
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) fn data_from_obkv_documents(
|
pub(crate) fn data_from_obkv_documents(
|
||||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
searchable_fields: Option<HashSet<FieldId>>,
|
searchable_fields: Option<HashSet<FieldId>>,
|
||||||
@ -152,7 +153,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -162,7 +163,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-pair-proximity-docids",
|
"word-pair-proximity-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -172,7 +173,11 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-wordcount-docids",
|
"field-id-wordcount-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
|
spawn_extraction_task::<
|
||||||
|
_,
|
||||||
|
_,
|
||||||
|
Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
|
||||||
|
>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -185,7 +190,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-docids",
|
"word-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -194,7 +199,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -204,7 +209,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-fid-docids",
|
"word-fid-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_fid_facet_strings_chunks,
|
docid_fid_facet_strings_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -214,7 +219,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-facet-string-docids",
|
"field-id-facet-string-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_fid_facet_numbers_chunks,
|
docid_fid_facet_numbers_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx,
|
lmdb_writer_sx,
|
||||||
@ -269,7 +274,7 @@ fn spawn_extraction_task<FE, FS, M>(
|
|||||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||||
/// - documents
|
/// - documents
|
||||||
fn send_original_documents_data(
|
fn send_original_documents_data(
|
||||||
original_documents_chunk: Result<grenad::Reader<File>>,
|
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
vectors_field_id: Option<FieldId>,
|
vectors_field_id: Option<FieldId>,
|
||||||
@ -311,7 +316,7 @@ fn send_original_documents_data(
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[allow(clippy::type_complexity)]
|
#[allow(clippy::type_complexity)]
|
||||||
fn send_and_extract_flattened_documents_data(
|
fn send_and_extract_flattened_documents_data(
|
||||||
flattened_documents_chunk: Result<grenad::Reader<File>>,
|
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
@ -328,7 +333,10 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
grenad::Reader<CursorClonableMmap>,
|
grenad::Reader<CursorClonableMmap>,
|
||||||
(
|
(
|
||||||
grenad::Reader<CursorClonableMmap>,
|
grenad::Reader<CursorClonableMmap>,
|
||||||
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
|
(
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||||
|
),
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
)> {
|
)> {
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, Seek};
|
use std::io::{self, BufReader, BufWriter, Seek};
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use grenad::{CompressionType, Sorter};
|
use grenad::{CompressionType, Sorter};
|
||||||
@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
|
|||||||
typ: grenad::CompressionType,
|
typ: grenad::CompressionType,
|
||||||
level: Option<u32>,
|
level: Option<u32>,
|
||||||
file: R,
|
file: R,
|
||||||
) -> grenad::Writer<R> {
|
) -> grenad::Writer<BufWriter<R>> {
|
||||||
let mut builder = grenad::Writer::builder();
|
let mut builder = grenad::Writer::builder();
|
||||||
builder.compression_type(typ);
|
builder.compression_type(typ);
|
||||||
if let Some(level) = level {
|
if let Some(level) = level {
|
||||||
builder.compression_level(level);
|
builder.compression_level(level);
|
||||||
}
|
}
|
||||||
builder.build(file)
|
builder.build(BufWriter::new(file))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn create_sorter(
|
pub fn create_sorter(
|
||||||
@ -53,7 +53,7 @@ pub fn create_sorter(
|
|||||||
pub fn sorter_into_reader(
|
pub fn sorter_into_reader(
|
||||||
sorter: grenad::Sorter<MergeFn>,
|
sorter: grenad::Sorter<MergeFn>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
@ -64,16 +64,18 @@ pub fn sorter_into_reader(
|
|||||||
writer_into_reader(writer)
|
writer_into_reader(writer)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
|
pub fn writer_into_reader(
|
||||||
let mut file = writer.into_inner()?;
|
writer: grenad::Writer<BufWriter<File>>,
|
||||||
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
|
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||||
file.rewind()?;
|
file.rewind()?;
|
||||||
grenad::Reader::new(file).map_err(Into::into)
|
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub unsafe fn as_cloneable_grenad(
|
pub unsafe fn as_cloneable_grenad(
|
||||||
reader: &grenad::Reader<File>,
|
reader: &grenad::Reader<BufReader<File>>,
|
||||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||||
let file = reader.get_ref();
|
let file = reader.get_ref().get_ref();
|
||||||
let mmap = memmap2::Mmap::map(file)?;
|
let mmap = memmap2::Mmap::map(file)?;
|
||||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||||
let reader = grenad::Reader::new(cursor)?;
|
let reader = grenad::Reader::new(cursor)?;
|
||||||
@ -89,8 +91,8 @@ where
|
|||||||
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
|
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MergeableReader for Vec<grenad::Reader<File>> {
|
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
|
||||||
type Output = grenad::Reader<File>;
|
type Output = grenad::Reader<BufReader<File>>;
|
||||||
|
|
||||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||||
let mut merger = MergerBuilder::new(merge_fn);
|
let mut merger = MergerBuilder::new(merge_fn);
|
||||||
@ -99,8 +101,8 @@ impl MergeableReader for Vec<grenad::Reader<File>> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
|
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||||
type Output = (grenad::Reader<File>, grenad::Reader<File>);
|
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
|
||||||
|
|
||||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||||
let mut m1 = MergerBuilder::new(merge_fn);
|
let mut m1 = MergerBuilder::new(merge_fn);
|
||||||
@ -125,7 +127,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
|
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
let merger = self.0.build();
|
let merger = self.0.build();
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
params.chunk_compression_type,
|
params.chunk_compression_type,
|
||||||
@ -176,7 +178,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
|||||||
reader: grenad::Reader<R>,
|
reader: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
documents_chunk_size: usize,
|
documents_chunk_size: usize,
|
||||||
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
|
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
|
||||||
let mut continue_reading = true;
|
let mut continue_reading = true;
|
||||||
let mut cursor = reader.into_cursor()?;
|
let mut cursor = reader.into_cursor()?;
|
||||||
|
|
||||||
|
@ -659,8 +659,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
new_documents_ids: self.new_documents_ids,
|
new_documents_ids: self.new_documents_ids,
|
||||||
replaced_documents_ids: self.replaced_documents_ids,
|
replaced_documents_ids: self.replaced_documents_ids,
|
||||||
documents_count: self.documents_count,
|
documents_count: self.documents_count,
|
||||||
original_documents,
|
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||||
flattened_documents,
|
flattened_documents: flattened_documents
|
||||||
|
.into_inner()
|
||||||
|
.map_err(|err| err.into_error())?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -779,8 +781,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
new_documents_ids: documents_ids,
|
new_documents_ids: documents_ids,
|
||||||
replaced_documents_ids: RoaringBitmap::default(),
|
replaced_documents_ids: RoaringBitmap::default(),
|
||||||
documents_count,
|
documents_count,
|
||||||
original_documents,
|
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||||
flattened_documents,
|
flattened_documents: flattened_documents
|
||||||
|
.into_inner()
|
||||||
|
.map_err(|err| err.into_error())?,
|
||||||
};
|
};
|
||||||
|
|
||||||
let new_facets = output.compute_real_facets(wtxn, self.index)?;
|
let new_facets = output.compute_real_facets(wtxn, self.index)?;
|
||||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use bytemuck::allocation::pod_collect_to_vec;
|
use bytemuck::allocation::pod_collect_to_vec;
|
||||||
use charabia::{Language, Script};
|
use charabia::{Language, Script};
|
||||||
@ -27,22 +27,22 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||||
Documents(grenad::Reader<CursorClonableMmap>),
|
Documents(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdWordcountDocids(grenad::Reader<File>),
|
FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
|
||||||
NewDocumentsIds(RoaringBitmap),
|
NewDocumentsIds(RoaringBitmap),
|
||||||
WordDocids {
|
WordDocids {
|
||||||
word_docids_reader: grenad::Reader<File>,
|
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
exact_word_docids_reader: grenad::Reader<File>,
|
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
},
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||||
WordFidDocids(grenad::Reader<File>),
|
WordFidDocids(grenad::Reader<BufReader<File>>),
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetIsNullDocids(grenad::Reader<File>),
|
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||||
GeoPoints(grenad::Reader<File>),
|
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||||
VectorPoints(grenad::Reader<File>),
|
VectorPoints(grenad::Reader<BufReader<File>>),
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::io::BufReader;
|
use std::io::{BufReader, BufWriter};
|
||||||
|
|
||||||
use grenad::CompressionType;
|
use grenad::CompressionType;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
@ -119,9 +119,9 @@ pub fn insert_into_database(
|
|||||||
pub fn write_into_lmdb_database_without_merging(
|
pub fn write_into_lmdb_database_without_merging(
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
database: heed::PolyDatabase,
|
database: heed::PolyDatabase,
|
||||||
writer: grenad::Writer<std::fs::File>,
|
writer: grenad::Writer<BufWriter<std::fs::File>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let file = writer.into_inner()?;
|
let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
let reader = grenad::Reader::new(BufReader::new(file))?;
|
||||||
if database.is_empty(wtxn)? {
|
if database.is_empty(wtxn)? {
|
||||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||||
|
@ -8,7 +8,7 @@ use Criterion::*;
|
|||||||
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
|
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
|
||||||
|
|
||||||
macro_rules! test_distinct {
|
macro_rules! test_distinct {
|
||||||
($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $criteria:expr, $n_res:expr) => {
|
($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $offset:expr, $criteria:expr, $n_res:expr) => {
|
||||||
#[test]
|
#[test]
|
||||||
fn $func() {
|
fn $func() {
|
||||||
let criteria = $criteria;
|
let criteria = $criteria;
|
||||||
@ -27,6 +27,7 @@ macro_rules! test_distinct {
|
|||||||
let mut search = Search::new(&rtxn, &index);
|
let mut search = Search::new(&rtxn, &index);
|
||||||
search.query(search::TEST_QUERY);
|
search.query(search::TEST_QUERY);
|
||||||
search.limit($limit);
|
search.limit($limit);
|
||||||
|
search.offset($offset);
|
||||||
search.exhaustive_number_hits($exhaustive);
|
search.exhaustive_number_hits($exhaustive);
|
||||||
|
|
||||||
search.terms_matching_strategy(TermsMatchingStrategy::default());
|
search.terms_matching_strategy(TermsMatchingStrategy::default());
|
||||||
@ -47,6 +48,7 @@ macro_rules! test_distinct {
|
|||||||
Some(d.id)
|
Some(d.id)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
.skip($offset)
|
||||||
.take($limit)
|
.take($limit)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
@ -61,6 +63,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
true,
|
true,
|
||||||
1,
|
1,
|
||||||
|
0,
|
||||||
vec![Words, Typo, Proximity, Attribute, Exactness],
|
vec![Words, Typo, Proximity, Attribute, Exactness],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -69,6 +72,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
true,
|
true,
|
||||||
1,
|
1,
|
||||||
|
0,
|
||||||
vec![Words, Typo, Proximity, Attribute, Exactness],
|
vec![Words, Typo, Proximity, Attribute, Exactness],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -77,6 +81,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
true,
|
true,
|
||||||
0,
|
0,
|
||||||
|
0,
|
||||||
vec![Desc(S("attribute_rank")), Desc(S("exactness_rank")), Exactness, Typo],
|
vec![Desc(S("attribute_rank")), Desc(S("exactness_rank")), Exactness, Typo],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -86,6 +91,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Typo, Proximity, Attribute, Exactness],
|
vec![Words, Typo, Proximity, Attribute, Exactness],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -94,6 +100,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Typo, Proximity, Attribute, Exactness],
|
vec![Words, Typo, Proximity, Attribute, Exactness],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -102,6 +109,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words],
|
vec![Words],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -110,6 +118,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words],
|
vec![Words],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -118,6 +127,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Typo],
|
vec![Words, Typo],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -126,6 +136,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Typo],
|
vec![Words, Typo],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -134,6 +145,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Proximity],
|
vec![Words, Proximity],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -142,6 +154,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Proximity],
|
vec![Words, Proximity],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -150,6 +163,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Attribute],
|
vec![Words, Attribute],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -158,6 +172,7 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Attribute],
|
vec![Words, Attribute],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
@ -166,6 +181,7 @@ test_distinct!(
|
|||||||
tag,
|
tag,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Exactness],
|
vec![Words, Exactness],
|
||||||
3
|
3
|
||||||
);
|
);
|
||||||
@ -174,6 +190,47 @@ test_distinct!(
|
|||||||
asc_desc_rank,
|
asc_desc_rank,
|
||||||
false,
|
false,
|
||||||
EXTERNAL_DOCUMENTS_IDS.len(),
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
0,
|
||||||
vec![Words, Exactness],
|
vec![Words, Exactness],
|
||||||
7
|
7
|
||||||
);
|
);
|
||||||
|
test_distinct!(
|
||||||
|
// testing: https://github.com/meilisearch/meilisearch/issues/4078
|
||||||
|
distinct_string_limit_and_offset,
|
||||||
|
tag,
|
||||||
|
false,
|
||||||
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
1,
|
||||||
|
vec![],
|
||||||
|
2
|
||||||
|
);
|
||||||
|
test_distinct!(
|
||||||
|
// testing: https://github.com/meilisearch/meilisearch/issues/4078
|
||||||
|
exhaustive_distinct_string_limit_and_offset,
|
||||||
|
tag,
|
||||||
|
true,
|
||||||
|
1,
|
||||||
|
2,
|
||||||
|
vec![],
|
||||||
|
1
|
||||||
|
);
|
||||||
|
test_distinct!(
|
||||||
|
// testing: https://github.com/meilisearch/meilisearch/issues/4078
|
||||||
|
distinct_number_limit_and_offset,
|
||||||
|
asc_desc_rank,
|
||||||
|
false,
|
||||||
|
EXTERNAL_DOCUMENTS_IDS.len(),
|
||||||
|
2,
|
||||||
|
vec![],
|
||||||
|
5
|
||||||
|
);
|
||||||
|
test_distinct!(
|
||||||
|
// testing: https://github.com/meilisearch/meilisearch/issues/4078
|
||||||
|
exhaustive_distinct_number_limit_and_offset,
|
||||||
|
asc_desc_rank,
|
||||||
|
true,
|
||||||
|
2,
|
||||||
|
4,
|
||||||
|
vec![],
|
||||||
|
3
|
||||||
|
);
|
||||||
|
Loading…
Reference in New Issue
Block a user