mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 09:04:59 +08:00
Make the FieldsIdsMap serialization more stable by using a BTreeMap
This commit is contained in:
parent
9133f38138
commit
566a7c3039
@ -1,9 +1,9 @@
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::collections::BTreeMap;
|
||||
use serde::{Serialize, Deserialize};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FieldsIdsMap {
|
||||
names_ids: HashMap<String, u8>,
|
||||
names_ids: BTreeMap<String, u8>,
|
||||
ids_names: BTreeMap<u8, String>,
|
||||
next_id: Option<u8>,
|
||||
}
|
||||
@ -11,7 +11,7 @@ pub struct FieldsIdsMap {
|
||||
impl FieldsIdsMap {
|
||||
pub fn new() -> FieldsIdsMap {
|
||||
FieldsIdsMap {
|
||||
names_ids: HashMap::new(),
|
||||
names_ids: BTreeMap::new(),
|
||||
ids_names: BTreeMap::new(),
|
||||
next_id: Some(0),
|
||||
}
|
||||
@ -66,6 +66,12 @@ impl FieldsIdsMap {
|
||||
}
|
||||
}
|
||||
|
||||
// `Default` implementation so callers can obtain a map without naming `new`
// (for example through `Option::unwrap_or_default`).
impl Default for FieldsIdsMap {
|
||||
/// Returns the same value as `FieldsIdsMap::new()`.
fn default() -> FieldsIdsMap {
|
||||
FieldsIdsMap::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
@ -1,26 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use csv::{StringRecord, Writer, ReaderBuilder};
|
||||
|
||||
/// Unit type carrying the `heed::BytesDecode` / `heed::BytesEncode`
/// implementations that convert between raw bytes and a `csv::StringRecord`.
pub struct CsvStringRecordCodec;
|
||||
|
||||
// Decoding side of the codec: parses stored bytes back into a CSV record.
impl heed::BytesDecode<'_> for CsvStringRecordCodec {
|
||||
type DItem = StringRecord;
|
||||
|
||||
/// Parses the first CSV record found in `bytes`.
///
/// Returns `None` when the reader yields no record at all, or when the
/// first record fails to parse (the underlying error is discarded).
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
let mut reader = ReaderBuilder::new()
|
||||
// Header handling is disabled so the first line is read as data.
.has_headers(false)
|
||||
.buffer_capacity(bytes.len()) // we will just read this record
|
||||
.from_reader(bytes);
|
||||
// `next()` yields an `Option<Result<_>>`: `?` propagates a missing
// record as `None`, and `ok()` turns a parse error into `None` too.
reader.records().next()?.ok()
|
||||
}
|
||||
}
|
||||
|
||||
// Encoding side of the codec: serializes a CSV record into owned bytes.
impl heed::BytesEncode<'_> for CsvStringRecordCodec {
|
||||
type EItem = StringRecord;
|
||||
|
||||
/// Writes `item` as a CSV record into a freshly allocated `Vec<u8>` and
/// returns it as `Cow::Owned`.
///
/// Returns `None` if writing the record fails or if the inner buffer
/// cannot be recovered from the writer; the error itself is dropped.
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut writer = Writer::from_writer(Vec::new());
|
||||
// `ok()?` converts a write error into an early `None` return.
writer.write_record(item).ok()?;
|
||||
// `into_inner` hands back the underlying `Vec<u8>` on success.
writer.into_inner().ok().map(Cow::Owned)
|
||||
}
|
||||
}
|
@ -1,7 +1,6 @@
|
||||
mod beu32_str_codec;
|
||||
mod bo_roaring_bitmap_codec;
|
||||
mod cbo_roaring_bitmap_codec;
|
||||
mod csv_string_record_codec;
|
||||
mod obkv_codec;
|
||||
mod roaring_bitmap_codec;
|
||||
mod str_str_u8_codec;
|
||||
@ -9,7 +8,6 @@ mod str_str_u8_codec;
|
||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
|
||||
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
||||
pub use self::csv_string_record_codec::CsvStringRecordCodec;
|
||||
pub use self::obkv_codec::ObkvCodec;
|
||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
||||
pub use self::str_str_u8_codec::StrStrU8Codec;
|
||||
|
24
src/index.rs
24
src/index.rs
@ -1,23 +1,23 @@
|
||||
use anyhow::Context;
|
||||
use csv::StringRecord;
|
||||
use heed::types::*;
|
||||
use heed::{PolyDatabase, Database};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::Search;
|
||||
use crate::{BEU32, DocumentId};
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::{
|
||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
||||
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
};
|
||||
|
||||
pub const WORDS_FST_KEY: &str = "words-fst";
|
||||
pub const HEADERS_KEY: &str = "headers";
|
||||
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Index {
|
||||
/// Contains many different types (e.g. the documents CSV headers).
|
||||
/// Contains many different types (e.g. the fields ids map).
|
||||
pub main: PolyDatabase,
|
||||
/// A word and all the documents ids containing the word.
|
||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
@ -25,7 +25,7 @@ pub struct Index {
|
||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the document id to the document as a CSV line.
|
||||
/// Maps the document id to the document as an obkv store.
|
||||
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
||||
}
|
||||
|
||||
@ -44,17 +44,17 @@ impl Index {
|
||||
Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
|
||||
}
|
||||
|
||||
pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers)
|
||||
pub fn put_fields_ids_map(&self, wtxn: &mut heed::RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
|
||||
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, FIELDS_IDS_MAP_KEY, map)
|
||||
}
|
||||
|
||||
pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> {
|
||||
self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY)
|
||||
pub fn fields_ids_map(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<FieldsIdsMap>> {
|
||||
self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)
|
||||
}
|
||||
|
||||
pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
|
||||
match self.headers(rtxn)? {
|
||||
Some(headers) => Ok(Some(headers.len())),
|
||||
pub fn number_of_fields(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
|
||||
match self.fields_ids_map(rtxn)? {
|
||||
Some(map) => Ok(Some(map.len())),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use anyhow::bail;
|
||||
use anyhow::{bail, ensure};
|
||||
use bstr::ByteSlice as _;
|
||||
use fst::IntoStreamer;
|
||||
use roaring::RoaringBitmap;
|
||||
@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
|
||||
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
|
||||
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
|
||||
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
|
||||
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
|
||||
|
||||
pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||
@ -25,8 +25,8 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||
build.extend_stream(op.into_stream()).unwrap();
|
||||
Ok(build.into_inner().unwrap())
|
||||
},
|
||||
HEADERS_KEY => {
|
||||
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
||||
FIELDS_IDS_MAP_KEY => {
|
||||
ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match");
|
||||
Ok(values[0].to_vec())
|
||||
},
|
||||
DOCUMENTS_IDS_KEY => word_docids_merge(&[], values),
|
||||
|
@ -16,7 +16,8 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
|
||||
use roaring::RoaringBitmap;
|
||||
use tempfile::tempfile;
|
||||
|
||||
use crate::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||
use crate::{SmallVec32, Position, DocumentId};
|
||||
|
||||
@ -30,7 +31,7 @@ const MAX_POSITION: usize = 1000;
|
||||
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
||||
|
||||
const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
|
||||
const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
|
||||
const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes();
|
||||
const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();
|
||||
|
||||
pub struct Readers {
|
||||
@ -182,10 +183,10 @@ impl Store {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> {
|
||||
let headers = CsvStringRecordCodec::bytes_encode(headers)
|
||||
.with_context(|| format!("could not encode csv record"))?;
|
||||
Ok(self.main_sorter.insert(HEADERS_KEY, headers)?)
|
||||
fn write_fields_ids_map(&mut self, map: &FieldsIdsMap) -> anyhow::Result<()> {
|
||||
let bytes = serde_json::to_vec(&map)?;
|
||||
self.main_sorter.insert(FIELDS_IDS_MAP_KEY, bytes)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_document(
|
||||
@ -320,7 +321,12 @@ impl Store {
|
||||
|
||||
// Write the headers into the store.
|
||||
let headers = rdr.headers()?;
|
||||
self.write_headers(&headers)?;
|
||||
|
||||
let mut fields_ids_map = FieldsIdsMap::new();
|
||||
for header in headers.iter() {
|
||||
fields_ids_map.insert(header).context("no more field id available")?;
|
||||
}
|
||||
self.write_fields_ids_map(&fields_ids_map)?;
|
||||
|
||||
let mut before = Instant::now();
|
||||
let mut document_id: usize = base_document_id;
|
||||
|
@ -20,8 +20,8 @@ pub use self::index::Index;
|
||||
pub use self::search::{Search, SearchResult};
|
||||
pub use self::update_store::UpdateStore;
|
||||
pub use self::heed_codec::{
|
||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
||||
CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
||||
ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
};
|
||||
|
||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||
|
@ -1,8 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, BufRead};
|
||||
use std::iter::once;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::Context;
|
||||
use heed::EnvOpenOptions;
|
||||
use log::debug;
|
||||
use structopt::StructOpt;
|
||||
@ -59,18 +61,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
||||
let query = result?;
|
||||
let result = index.search(&rtxn).query(query).execute().unwrap();
|
||||
|
||||
let headers = match index.headers(&rtxn)? {
|
||||
Some(headers) => headers,
|
||||
None => return Ok(()),
|
||||
};
|
||||
let mut stdout = io::stdout();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?.unwrap_or_default();
|
||||
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
|
||||
|
||||
let mut wtr = csv::Writer::from_writer(io::stdout());
|
||||
wtr.write_record(&headers)?;
|
||||
for (_id, record) in documents {
|
||||
wtr.write_record(record.iter().map(|(_, v)| v))?;
|
||||
let document: anyhow::Result<HashMap<_, _>> = record.iter()
|
||||
.map(|(k, v)| {
|
||||
let key = fields_ids_map.name(k).context("field id not found")?;
|
||||
let val = std::str::from_utf8(v)?;
|
||||
Ok((key, val))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let document = document?;
|
||||
serde_json::to_writer(&mut stdout, &document)?;
|
||||
}
|
||||
wtr.flush()?;
|
||||
|
||||
debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
|
||||
}
|
||||
|
@ -382,11 +382,12 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
||||
let SearchResult { found_words, documents_ids } = search.execute().unwrap();
|
||||
|
||||
let mut documents = Vec::new();
|
||||
if let Some(headers) = index.headers(&rtxn).unwrap() {
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap().unwrap_or_default();
|
||||
|
||||
for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() {
|
||||
let mut record = record.iter()
|
||||
.map(|(key_id, value)| {
|
||||
let key = headers[key_id as usize].to_owned();
|
||||
let key = fields_ids_map.name(key_id).unwrap().to_owned();
|
||||
let value = std::str::from_utf8(value).unwrap().to_owned();
|
||||
(key, value)
|
||||
})
|
||||
@ -398,7 +399,6 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
|
||||
|
||||
documents.push(record);
|
||||
}
|
||||
}
|
||||
|
||||
Response::builder()
|
||||
.header("Content-Type", "application/json")
|
||||
|
Loading…
Reference in New Issue
Block a user