mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 09:04:59 +08:00
Directly write to LMDB without intermediate final MTBL
This commit is contained in:
parent
2174042994
commit
1df1f88fe1
@ -1,5 +1,5 @@
|
|||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::convert::{TryInto, TryFrom};
|
use std::convert::TryFrom;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
@ -15,7 +15,7 @@ use roaring::RoaringBitmap;
|
|||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
use mega_mini_indexer::alphanumeric_tokens;
|
use mega_mini_indexer::alphanumeric_tokens;
|
||||||
use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, Index, DocumentId};
|
use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId};
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
@ -136,7 +136,9 @@ impl MtblKvStore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn from_many(stores: Vec<MtblKvStore>) -> anyhow::Result<MtblKvStore> {
|
fn from_many<F>(stores: Vec<MtblKvStore>, mut f: F) -> anyhow::Result<()>
|
||||||
|
where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()>
|
||||||
|
{
|
||||||
eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len());
|
eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len());
|
||||||
|
|
||||||
let mmaps: Vec<_> = stores.iter().flat_map(|m| {
|
let mmaps: Vec<_> = stores.iter().flat_map(|m| {
|
||||||
@ -147,21 +149,16 @@ impl MtblKvStore {
|
|||||||
Reader::new(&mmap, ReaderOptions::default()).unwrap()
|
Reader::new(&mmap, ReaderOptions::default()).unwrap()
|
||||||
}).collect();
|
}).collect();
|
||||||
|
|
||||||
let outfile = tempfile::tempfile()?;
|
|
||||||
let mut out = Writer::new(outfile, None)?;
|
|
||||||
|
|
||||||
let opt = MergerOptions { merge: MtblKvStore::merge };
|
let opt = MergerOptions { merge: MtblKvStore::merge };
|
||||||
let mut merger = Merger::new(sources, opt);
|
let mut merger = Merger::new(sources, opt);
|
||||||
|
|
||||||
let mut iter = merger.iter();
|
let mut iter = merger.iter();
|
||||||
while let Some((k, v)) = iter.next() {
|
while let Some((k, v)) = iter.next() {
|
||||||
out.add(k, v).unwrap();
|
(f)(k, v)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let out = out.into_inner()?;
|
|
||||||
|
|
||||||
eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index());
|
eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index());
|
||||||
Ok(MtblKvStore(Some(out)))
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -226,46 +223,32 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO merge with the previous values
|
// TODO merge with the previous values
|
||||||
fn writer(wtxn: &mut heed::RwTxn, index: Index, mtbl_store: MtblKvStore) -> anyhow::Result<usize> {
|
fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> {
|
||||||
let mtbl_store = match mtbl_store.0 {
|
if key == b"\0words-fst" {
|
||||||
Some(store) => unsafe { memmap::Mmap::map(&store)? },
|
// Write the words fst
|
||||||
None => return Ok(0),
|
index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", val)?;
|
||||||
};
|
}
|
||||||
let mtbl_store = Reader::new(&mtbl_store, ReaderOptions::default()).unwrap();
|
else if key == b"\0headers" {
|
||||||
|
// Write the headers
|
||||||
// Write the words fst
|
index.main.put::<_, Str, ByteSlice>(wtxn, "headers", val)?;
|
||||||
let fst = mtbl_store.get(b"\0words-fst").unwrap();
|
}
|
||||||
let fst = fst::Set::new(fst)?;
|
else if key.starts_with(&[1]) {
|
||||||
index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &fst.as_fst().as_bytes())?;
|
// Write the postings lists
|
||||||
|
index.postings_ids.as_polymorph()
|
||||||
// Write and merge the headers
|
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
||||||
let headers = mtbl_store.get(b"\0headers").unwrap();
|
}
|
||||||
index.main.put::<_, Str, ByteSlice>(wtxn, "headers", headers.as_ref())?;
|
else if key.starts_with(&[2]) {
|
||||||
|
// Write the prefix postings lists
|
||||||
// Write and merge the postings lists
|
index.prefix_postings_ids.as_polymorph()
|
||||||
let mut iter = mtbl_store.iter_prefix(&[1]).unwrap();
|
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
||||||
while let Some((word, postings)) = iter.next() {
|
}
|
||||||
let word = std::str::from_utf8(&word[1..]).unwrap();
|
else if key.starts_with(&[3]) {
|
||||||
index.postings_ids.put(wtxn, &word, &postings)?;
|
// Write the documents
|
||||||
|
index.documents.as_polymorph()
|
||||||
|
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write and merge the prefix postings lists
|
Ok(())
|
||||||
let mut iter = mtbl_store.iter_prefix(&[2]).unwrap();
|
|
||||||
while let Some((word, postings)) = iter.next() {
|
|
||||||
let word = std::str::from_utf8(&word[1..]).unwrap();
|
|
||||||
index.prefix_postings_ids.put(wtxn, &word, &postings)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write the documents
|
|
||||||
let mut count = 0;
|
|
||||||
let mut iter = mtbl_store.iter_prefix(&[3]).unwrap();
|
|
||||||
while let Some((id_bytes, content)) = iter.next() {
|
|
||||||
let id = id_bytes[1..].try_into().map(u32::from_be_bytes).unwrap();
|
|
||||||
index.documents.put(wtxn, &BEU32::new(id), &content)?;
|
|
||||||
count += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(count)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
fn main() -> anyhow::Result<()> {
|
||||||
@ -291,18 +274,24 @@ fn main() -> anyhow::Result<()> {
|
|||||||
})
|
})
|
||||||
.collect::<Result<_, _>>()?;
|
.collect::<Result<_, _>>()?;
|
||||||
|
|
||||||
while stores.len() > 1 {
|
while stores.len() > 3 {
|
||||||
|
let chunk_size = (stores.len() / rayon::current_num_threads()).max(2);
|
||||||
let s = std::mem::take(&mut stores);
|
let s = std::mem::take(&mut stores);
|
||||||
stores = s.into_par_iter().chunks(3)
|
stores = s.into_par_iter().chunks(chunk_size)
|
||||||
.map(MtblKvStore::from_many)
|
.map(|v| {
|
||||||
|
let outfile = tempfile::tempfile()?;
|
||||||
|
let mut out = Writer::new(outfile, None)?;
|
||||||
|
MtblKvStore::from_many(v, |k, v| Ok(out.add(k, v).unwrap()))?;
|
||||||
|
let out = out.into_inner()?;
|
||||||
|
Ok(MtblKvStore(Some(out))) as anyhow::Result<_>
|
||||||
|
})
|
||||||
.collect::<Result<_, _>>()?;
|
.collect::<Result<_, _>>()?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mtbl_store = stores.pop().unwrap_or_default();
|
|
||||||
|
|
||||||
eprintln!("We are writing into LMDB...");
|
eprintln!("We are writing into LMDB...");
|
||||||
let mut wtxn = env.write_txn()?;
|
let mut wtxn = env.write_txn()?;
|
||||||
let count = writer(&mut wtxn, index, mtbl_store)?;
|
MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?;
|
||||||
|
let count = index.documents.len(&wtxn)?;
|
||||||
wtxn.commit()?;
|
wtxn.commit()?;
|
||||||
eprintln!("Wrote {} documents into LMDB", count);
|
eprintln!("Wrote {} documents into LMDB", count);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user