Directly use a writer for the docid word positions

Clément Renault 2020-10-04 18:17:53 +02:00
parent 67577a3760
commit 99705deb7d
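The docid word positions entries no longer go through an MTBL sorter. Each key is the big-endian document id followed by the word, and the words are now iterated through a BTreeMap, so for a given document the keys come out in lexicographic order and can be appended straight to a writer. Below is a minimal, self-contained sketch of that idea; the SortedWriter type is a hypothetical stand-in for the real append-only MTBL writer, not the actual milli/grenad API.

use std::collections::{BTreeMap, HashMap};
use std::iter::FromIterator;

type DocumentId = u32;

// Hypothetical stand-in for the real append-only MTBL writer: it only
// accepts keys in strictly increasing order, which is the property this
// commit relies on.
struct SortedWriter {
    last_key: Option<Vec<u8>>,
    entries: Vec<(Vec<u8>, Vec<u8>)>,
}

impl SortedWriter {
    fn new() -> SortedWriter {
        SortedWriter { last_key: None, entries: Vec::new() }
    }

    fn insert(&mut self, key: &[u8], value: &[u8]) -> Result<(), String> {
        if self.last_key.as_deref().map_or(false, |last| key <= last) {
            return Err("keys must be inserted in ascending order".to_string());
        }
        self.last_key = Some(key.to_vec());
        self.entries.push((key.to_vec(), value.to_vec()));
        Ok(())
    }
}

// Same shape as the function touched by this commit: the key is the
// big-endian document id followed by the word, and the words are iterated
// through a BTreeMap so the keys are already sorted, no sorter needed.
fn write_docid_word_positions(
    writer: &mut SortedWriter,
    id: DocumentId,
    words_positions: &HashMap<String, Vec<u32>>,
) -> Result<(), String> {
    let mut key = id.to_be_bytes().to_vec();
    let base_size = key.len();

    // We order the words lexicographically, this way we avoid passing by a sorter.
    let words_positions = BTreeMap::from_iter(words_positions);

    for (word, positions) in words_positions {
        key.truncate(base_size);
        key.extend_from_slice(word.as_bytes());
        // The real code serializes the positions as a compressed bitmap;
        // plain little-endian bytes are enough for the sketch.
        let bytes: Vec<u8> = positions.iter().flat_map(|p| p.to_le_bytes()).collect();
        writer.insert(&key, &bytes)?;
    }

    Ok(())
}

fn main() -> Result<(), String> {
    let mut writer = SortedWriter::new();

    let mut words_positions = HashMap::new();
    words_positions.insert("world".to_string(), vec![2]);
    words_positions.insert("hello".to_string(), vec![1]);

    // Documents are indexed in increasing id order and the id prefix is
    // big-endian, so keys also stay sorted across documents.
    write_docid_word_positions(&mut writer, 0, &words_positions)?;
    write_docid_word_positions(&mut writer, 1, &words_positions)?;

    assert!(writer.entries.windows(2).all(|w| w[0].0 < w[1].0));
    println!("{} entries written in ascending key order", writer.entries.len());
    Ok(())
}

Assuming documents are indexed with increasing ids, as the incrementing document id counter provides, the keys stay sorted across documents too, which is why flushing can turn the writer directly into a reader instead of writing an intermediate sorter into a fresh temporary file.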


@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
 use std::fs::File;
 use std::io::{self, Read, Write};
@@ -86,7 +86,7 @@ struct IndexerOpt {
     max_nb_chunks: Option<usize>,

     /// MTBL max memory in bytes.
-    #[structopt(long, default_value = "346030080")] // 330 MB
+    #[structopt(long, default_value = "440401920")] // 420 MB
     max_memory: usize,

     /// Size of the linked hash map cache when indexing.
@@ -198,6 +198,14 @@ fn compute_words_pair_proximities(
 type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;

+struct Readers {
+    main: Reader<Mmap>,
+    word_docids: Reader<Mmap>,
+    docid_word_positions: Reader<Mmap>,
+    words_pairs_proximities_docids: Reader<Mmap>,
+    documents: Reader<Mmap>,
+}
+
 struct Store {
     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
     word_docids_limit: usize,
@@ -210,20 +218,12 @@ struct Store {
     // MTBL sorters
     main_sorter: Sorter<MergeFn>,
     word_docids_sorter: Sorter<MergeFn>,
-    docid_word_positions_sorter: Sorter<MergeFn>,
     words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
     // MTBL writers
+    docid_word_positions_writer: Writer<File>,
     documents_writer: Writer<File>,
 }

-struct Readers {
-    main: Reader<Mmap>,
-    word_docids: Reader<Mmap>,
-    docid_word_positions: Reader<Mmap>,
-    words_pairs_proximities_docids: Reader<Mmap>,
-    documents: Reader<Mmap>,
-}
-
 impl Store {
     pub fn new(
         linked_hash_map_size: usize,
@@ -247,13 +247,6 @@ impl Store {
             max_nb_chunks,
             max_memory,
         );
-        let docid_word_positions_sorter = create_sorter(
-            docid_word_positions_merge,
-            chunk_compression_type,
-            chunk_compression_level,
-            max_nb_chunks,
-            max_memory,
-        );
         let words_pairs_proximities_docids_sorter = create_sorter(
             words_pairs_proximities_docids_merge,
             chunk_compression_type,
@@ -262,12 +255,12 @@ impl Store {
             max_memory,
         );

-        let mut documents_builder = Writer::builder();
-        documents_builder.compression_type(chunk_compression_type);
-        if let Some(level) = chunk_compression_level {
-            documents_builder.compression_level(level);
-        }
-        let documents_writer = tempfile().map(|f| documents_builder.build(f))?;
+        let documents_writer = tempfile().map(|f| {
+            create_writer(chunk_compression_type, chunk_compression_level, f)
+        })?;
+        let docid_word_positions_writer = tempfile().map(|f| {
+            create_writer(chunk_compression_type, chunk_compression_level, f)
+        })?;

         Ok(Store {
             word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
@@ -280,9 +273,9 @@ impl Store {
             main_sorter,
             word_docids_sorter,
-            docid_word_positions_sorter,
             words_pairs_proximities_docids_sorter,
+            docid_word_positions_writer,
             documents_writer,
         })
     }
@@ -372,7 +365,7 @@ impl Store {
         self.documents_ids.insert(document_id);
         self.documents_writer.insert(document_id.to_be_bytes(), record)?;
-        Self::write_docid_word_positions(&mut self.docid_word_positions_sorter, document_id, words_positions)?;
+        Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;

         Ok(())
     }
@@ -406,7 +399,7 @@ impl Store {
     }

     fn write_docid_word_positions(
-        sorter: &mut Sorter<MergeFn>,
+        writer: &mut Writer<File>,
         id: DocumentId,
         words_positions: &HashMap<String, SmallVec32<Position>>,
     ) -> anyhow::Result<()>
@@ -415,6 +408,9 @@ impl Store {
         let mut key = id.to_be_bytes().to_vec();
         let base_size = key.len();

+        // We order the words lexicographically, this way we avoid passing by a sorter.
+        let words_positions = BTreeMap::from_iter(words_positions);
+
         for (word, positions) in words_positions {
             key.truncate(base_size);
             key.extend_from_slice(word.as_bytes());
@@ -424,7 +420,7 @@ impl Store {
                 .with_context(|| "could not serialize positions")?;
             // that we write under the generated key into MTBL
             if lmdb_key_valid_size(&key) {
-                sorter.insert(&key, &bytes)?;
+                writer.insert(&key, &bytes)?;
             }
         }
@@ -542,16 +538,13 @@ impl Store {
         let mut main_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
         self.main_sorter.write_into(&mut main_wtr)?;

-        let mut docid_word_positions_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
-        self.docid_word_positions_sorter.write_into(&mut docid_word_positions_wtr)?;
-
         let mut words_pairs_proximities_docids_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
         self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;

         let main = writer_into_reader(main_wtr)?;
         let word_docids = writer_into_reader(word_docids_wtr)?;
-        let docid_word_positions = writer_into_reader(docid_word_positions_wtr)?;
         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr)?;
+        let docid_word_positions = writer_into_reader(self.docid_word_positions_writer)?;
         let documents = writer_into_reader(self.documents_writer)?;

         Ok(Readers {
@@ -602,7 +595,7 @@ fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
 }

 fn docid_word_positions_merge(key: &[u8], _values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
-    panic!("merging word docid positions is an error ({:?})", key.as_bstr())
+    panic!("merging docid word positions is an error ({:?})", key.as_bstr())
 }

 fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {