mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 20:15:07 +08:00
Make the FieldsIdsMap serialization more stable by using a BTreeMap
This commit is contained in:
parent
9133f38138
commit
566a7c3039
@ -1,9 +1,9 @@
|
|||||||
use std::collections::{HashMap, BTreeMap};
|
use std::collections::BTreeMap;
|
||||||
use serde::{Serialize, Deserialize};
|
use serde::{Serialize, Deserialize};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct FieldsIdsMap {
|
pub struct FieldsIdsMap {
|
||||||
names_ids: HashMap<String, u8>,
|
names_ids: BTreeMap<String, u8>,
|
||||||
ids_names: BTreeMap<u8, String>,
|
ids_names: BTreeMap<u8, String>,
|
||||||
next_id: Option<u8>,
|
next_id: Option<u8>,
|
||||||
}
|
}
|
||||||
@ -11,7 +11,7 @@ pub struct FieldsIdsMap {
|
|||||||
impl FieldsIdsMap {
|
impl FieldsIdsMap {
|
||||||
pub fn new() -> FieldsIdsMap {
|
pub fn new() -> FieldsIdsMap {
|
||||||
FieldsIdsMap {
|
FieldsIdsMap {
|
||||||
names_ids: HashMap::new(),
|
names_ids: BTreeMap::new(),
|
||||||
ids_names: BTreeMap::new(),
|
ids_names: BTreeMap::new(),
|
||||||
next_id: Some(0),
|
next_id: Some(0),
|
||||||
}
|
}
|
||||||
@ -66,6 +66,12 @@ impl FieldsIdsMap {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Default for FieldsIdsMap {
|
||||||
|
fn default() -> FieldsIdsMap {
|
||||||
|
FieldsIdsMap::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -1,26 +0,0 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use csv::{StringRecord, Writer, ReaderBuilder};
|
|
||||||
|
|
||||||
pub struct CsvStringRecordCodec;
|
|
||||||
|
|
||||||
impl heed::BytesDecode<'_> for CsvStringRecordCodec {
|
|
||||||
type DItem = StringRecord;
|
|
||||||
|
|
||||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
|
||||||
let mut reader = ReaderBuilder::new()
|
|
||||||
.has_headers(false)
|
|
||||||
.buffer_capacity(bytes.len()) // we will just read this record
|
|
||||||
.from_reader(bytes);
|
|
||||||
reader.records().next()?.ok() // it return an Option of Result
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl heed::BytesEncode<'_> for CsvStringRecordCodec {
|
|
||||||
type EItem = StringRecord;
|
|
||||||
|
|
||||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
|
||||||
let mut writer = Writer::from_writer(Vec::new());
|
|
||||||
writer.write_record(item).ok()?;
|
|
||||||
writer.into_inner().ok().map(Cow::Owned)
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,7 +1,6 @@
|
|||||||
mod beu32_str_codec;
|
mod beu32_str_codec;
|
||||||
mod bo_roaring_bitmap_codec;
|
mod bo_roaring_bitmap_codec;
|
||||||
mod cbo_roaring_bitmap_codec;
|
mod cbo_roaring_bitmap_codec;
|
||||||
mod csv_string_record_codec;
|
|
||||||
mod obkv_codec;
|
mod obkv_codec;
|
||||||
mod roaring_bitmap_codec;
|
mod roaring_bitmap_codec;
|
||||||
mod str_str_u8_codec;
|
mod str_str_u8_codec;
|
||||||
@ -9,7 +8,6 @@ mod str_str_u8_codec;
|
|||||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||||
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
|
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
|
||||||
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
||||||
pub use self::csv_string_record_codec::CsvStringRecordCodec;
|
|
||||||
pub use self::obkv_codec::ObkvCodec;
|
pub use self::obkv_codec::ObkvCodec;
|
||||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
||||||
pub use self::str_str_u8_codec::StrStrU8Codec;
|
pub use self::str_str_u8_codec::StrStrU8Codec;
|
||||||
|
24
src/index.rs
24
src/index.rs
@ -1,23 +1,23 @@
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use csv::StringRecord;
|
|
||||||
use heed::types::*;
|
use heed::types::*;
|
||||||
use heed::{PolyDatabase, Database};
|
use heed::{PolyDatabase, Database};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::Search;
|
use crate::Search;
|
||||||
use crate::{BEU32, DocumentId};
|
use crate::{BEU32, DocumentId};
|
||||||
|
use crate::fields_ids_map::FieldsIdsMap;
|
||||||
use crate::{
|
use crate::{
|
||||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
||||||
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const WORDS_FST_KEY: &str = "words-fst";
|
pub const WORDS_FST_KEY: &str = "words-fst";
|
||||||
pub const HEADERS_KEY: &str = "headers";
|
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||||
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Index {
|
pub struct Index {
|
||||||
/// Contains many different types (e.g. the documents CSV headers).
|
/// Contains many different types (e.g. the fields ids map).
|
||||||
pub main: PolyDatabase,
|
pub main: PolyDatabase,
|
||||||
/// A word and all the documents ids containing the word.
|
/// A word and all the documents ids containing the word.
|
||||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
@ -25,7 +25,7 @@ pub struct Index {
|
|||||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
||||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||||
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the document id to the document as a CSV line.
|
/// Maps the document id to the document as an obkv store.
|
||||||
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,17 +44,17 @@ impl Index {
|
|||||||
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
|
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> {
|
pub fn put_fields_ids_map(&self, wtxn: &mut heed::RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
|
||||||
self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers)
|
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, FIELDS_IDS_MAP_KEY, map)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> {
|
pub fn fields_ids_map(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<FieldsIdsMap>> {
|
||||||
self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY)
|
self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
|
pub fn number_of_fields(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
|
||||||
match self.headers(rtxn)? {
|
match self.fields_ids_map(rtxn)? {
|
||||||
Some(headers) => Ok(Some(headers.len())),
|
Some(map) => Ok(Some(map.len())),
|
||||||
None => Ok(None),
|
None => Ok(None),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
use anyhow::bail;
|
use anyhow::{bail, ensure};
|
||||||
use bstr::ByteSlice as _;
|
use bstr::ByteSlice as _;
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
|
|||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
|
|
||||||
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
|
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
|
||||||
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
|
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
|
||||||
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
|
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
|
||||||
|
|
||||||
pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
@ -25,8 +25,8 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
|||||||
build.extend_stream(op.into_stream()).unwrap();
|
build.extend_stream(op.into_stream()).unwrap();
|
||||||
Ok(build.into_inner().unwrap())
|
Ok(build.into_inner().unwrap())
|
||||||
},
|
},
|
||||||
HEADERS_KEY => {
|
FIELDS_IDS_MAP_KEY => {
|
||||||
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match");
|
||||||
Ok(values[0].to_vec())
|
Ok(values[0].to_vec())
|
||||||
},
|
},
|
||||||
DOCUMENTS_IDS_KEY => word_docids_merge(&[], values),
|
DOCUMENTS_IDS_KEY => word_docids_merge(&[], values),
|
||||||
|
@ -16,7 +16,8 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use tempfile::tempfile;
|
use tempfile::tempfile;
|
||||||
|
|
||||||
use crate::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
use crate::fields_ids_map::FieldsIdsMap;
|
||||||
|
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||||
use crate::{SmallVec32, Position, DocumentId};
|
use crate::{SmallVec32, Position, DocumentId};
|
||||||
|
|
||||||
@ -30,7 +31,7 @@ const MAX_POSITION: usize = 1000;
|
|||||||
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
||||||
|
|
||||||
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
|
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
|
||||||
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
|
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
|
||||||
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
|
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
|
||||||
|
|
||||||
pub struct Readers {
|
pub struct Readers {
|
||||||
@ -182,10 +183,10 @@ impl Store {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> {
|
fn write_fields_ids_map(&mut self, map: &FieldsIdsMap) -> anyhow::Result<()> {
|
||||||
let headers = CsvStringRecordCodec::bytes_encode(headers)
|
let bytes = serde_json::to_vec(&map)?;
|
||||||
.with_context(|| format!("could not encode csv record"))?;
|
self.main_sorter.insert(FIELDS_IDS_MAP_KEY, bytes)?;
|
||||||
Ok(self.main_sorter.insert(HEADERS_KEY, headers)?)
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_document(
|
fn write_document(
|
||||||
@ -320,7 +321,12 @@ impl Store {
|
|||||||
|
|
||||||
// Write the headers into the store.
|
// Write the headers into the store.
|
||||||
let headers = rdr.headers()?;
|
let headers = rdr.headers()?;
|
||||||
self.write_headers(&headers)?;
|
|
||||||
|
let mut fields_ids_map = FieldsIdsMap::new();
|
||||||
|
for header in headers.iter() {
|
||||||
|
fields_ids_map.insert(header).context("no more field id available")?;
|
||||||
|
}
|
||||||
|
self.write_fields_ids_map(&fields_ids_map)?;
|
||||||
|
|
||||||
let mut before = Instant::now();
|
let mut before = Instant::now();
|
||||||
let mut document_id: usize = base_document_id;
|
let mut document_id: usize = base_document_id;
|
||||||
|
@ -20,8 +20,8 @@ pub use self::index::Index;
|
|||||||
pub use self::search::{Search, SearchResult};
|
pub use self::search::{Search, SearchResult};
|
||||||
pub use self::update_store::UpdateStore;
|
pub use self::update_store::UpdateStore;
|
||||||
pub use self::heed_codec::{
|
pub use self::heed_codec::{
|
||||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
||||||
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
|
@ -1,8 +1,10 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
use std::io::{self, BufRead};
|
use std::io::{self, BufRead};
|
||||||
use std::iter::once;
|
use std::iter::once;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
@ -59,18 +61,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
|||||||
let query = result?;
|
let query = result?;
|
||||||
let result = index.search(&rtxn).query(query).execute().unwrap();
|
let result = index.search(&rtxn).query(query).execute().unwrap();
|
||||||
|
|
||||||
let headers = match index.headers(&rtxn)? {
|
let mut stdout = io::stdout();
|
||||||
Some(headers) => headers,
|
let fields_ids_map = index.fields_ids_map(&rtxn)?.unwrap_or_default();
|
||||||
None => return Ok(()),
|
|
||||||
};
|
|
||||||
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
|
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
|
||||||
|
|
||||||
let mut wtr = csv::Writer::from_writer(io::stdout());
|
|
||||||
wtr.write_record(&headers)?;
|
|
||||||
for (_id, record) in documents {
|
for (_id, record) in documents {
|
||||||
wtr.write_record(record.iter().map(|(_, v)| v))?;
|
let document: anyhow::Result<HashMap<_, _>> = record.iter()
|
||||||
|
.map(|(k, v)| {
|
||||||
|
let key = fields_ids_map.name(k).context("field id not found")?;
|
||||||
|
let val = std::str::from_utf8(v)?;
|
||||||
|
Ok((key, val))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let document = document?;
|
||||||
|
serde_json::to_writer(&mut stdout, &document)?;
|
||||||
}
|
}
|
||||||
wtr.flush()?;
|
|
||||||
|
|
||||||
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
|
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
|
||||||
}
|
}
|
||||||
|
@ -382,22 +382,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
|||||||
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
|
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
|
||||||
|
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
if let Some(headers) = index.headers(&rtxn).unwrap() {
|
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap().unwrap_or_default();
|
||||||
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
|
|
||||||
let mut record = record.iter()
|
|
||||||
.map(|(key_id, value)| {
|
|
||||||
let key = headers[key_id as usize].to_owned();
|
|
||||||
let value = std::str::from_utf8(value).unwrap().to_owned();
|
|
||||||
(key, value)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if !disable_highlighting {
|
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
|
||||||
highlight_record(&mut record, &found_words);
|
let mut record = record.iter()
|
||||||
}
|
.map(|(key_id, value)| {
|
||||||
|
let key = fields_ids_map.name(key_id).unwrap().to_owned();
|
||||||
|
let value = std::str::from_utf8(value).unwrap().to_owned();
|
||||||
|
(key, value)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
documents.push(record);
|
if !disable_highlighting {
|
||||||
|
highlight_record(&mut record, &found_words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
documents.push(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
Response::builder()
|
Response::builder()
|
||||||
|
Loading…
Reference in New Issue
Block a user