From 99705deb7d7839efd0ccb6273976e609445f8a74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 4 Oct 2020 18:17:53 +0200
Subject: [PATCH] Directly use a writer for the docid word positions

---
 src/bin/indexer.rs | 59 ++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index fbf315a75..9a452114a 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
 use std::fs::File;
 use std::io::{self, Read, Write};
@@ -86,7 +86,7 @@ struct IndexerOpt {
     max_nb_chunks: Option<usize>,
 
     /// MTBL max memory in bytes.
-    #[structopt(long, default_value = "346030080")] // 330 MB
+    #[structopt(long, default_value = "440401920")] // 420 MB
     max_memory: usize,
 
     /// Size of the linked hash map cache when indexing.
@@ -198,6 +198,14 @@ fn compute_words_pair_proximities(
 
 type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
 
+struct Readers {
+    main: Reader<Mmap>,
+    word_docids: Reader<Mmap>,
+    docid_word_positions: Reader<Mmap>,
+    words_pairs_proximities_docids: Reader<Mmap>,
+    documents: Reader<Mmap>,
+}
+
 struct Store {
     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
     word_docids_limit: usize,
@@ -210,20 +218,12 @@ struct Store {
     // MTBL sorters
     main_sorter: Sorter<MergeFn>,
     word_docids_sorter: Sorter<MergeFn>,
-    docid_word_positions_sorter: Sorter<MergeFn>,
     words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
     // MTBL writers
+    docid_word_positions_writer: Writer<File>,
     documents_writer: Writer<File>,
 }
 
-struct Readers {
-    main: Reader<Mmap>,
-    word_docids: Reader<Mmap>,
-    docid_word_positions: Reader<Mmap>,
-    words_pairs_proximities_docids: Reader<Mmap>,
-    documents: Reader<Mmap>,
-}
-
 impl Store {
     pub fn new(
         linked_hash_map_size: usize,
@@ -247,27 +247,20 @@ impl Store {
             max_nb_chunks,
             max_memory,
         );
-        let docid_word_positions_sorter = create_sorter(
-            docid_word_positions_merge,
-            chunk_compression_type,
-            chunk_compression_level,
-            max_nb_chunks,
-            max_memory,
-        );
         let words_pairs_proximities_docids_sorter = create_sorter(
             words_pairs_proximities_docids_merge,
            chunk_compression_type,
             chunk_compression_level,
             max_nb_chunks,
             max_memory,
         );
 
-        let mut documents_builder = Writer::builder();
-        documents_builder.compression_type(chunk_compression_type);
-        if let Some(level) = chunk_compression_level {
-            documents_builder.compression_level(level);
-        }
-        let documents_writer = tempfile().map(|f| documents_builder.build(f))?;
+        let documents_writer = tempfile().map(|f| {
+            create_writer(chunk_compression_type, chunk_compression_level, f)
+        })?;
+        let docid_word_positions_writer = tempfile().map(|f| {
+            create_writer(chunk_compression_type, chunk_compression_level, f)
+        })?;
 
         Ok(Store {
             word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
@@ -280,9 +273,9 @@ impl Store {
 
             main_sorter,
             word_docids_sorter,
-            docid_word_positions_sorter,
             words_pairs_proximities_docids_sorter,
 
+            docid_word_positions_writer,
             documents_writer,
         })
     }
@@ -372,7 +365,7 @@ impl Store {
 
         self.documents_ids.insert(document_id);
         self.documents_writer.insert(document_id.to_be_bytes(), record)?;
-        Self::write_docid_word_positions(&mut self.docid_word_positions_sorter, document_id, words_positions)?;
+        Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
 
         Ok(())
     }
@@ -406,7 +399,7 @@ impl Store {
     }
 
     fn write_docid_word_positions(
-        sorter: &mut Sorter<MergeFn>,
+        writer: &mut Writer<File>,
         id: DocumentId,
         words_positions: &HashMap<String, SmallVec32<Position>>,
     ) -> anyhow::Result<()>
@@ -415,6 +408,9 @@ impl Store {
         let mut key = id.to_be_bytes().to_vec();
         let base_size = key.len();
 
+        // We order the words lexicographically, this way we avoid passing by a sorter.
+        let words_positions = BTreeMap::from_iter(words_positions);
+
         for (word, positions) in words_positions {
             key.truncate(base_size);
             key.extend_from_slice(word.as_bytes());
@@ -424,7 +420,7 @@ impl Store {
                 .with_context(|| "could not serialize positions")?;
             // that we write under the generated key into MTBL
             if lmdb_key_valid_size(&key) {
-                sorter.insert(&key, &bytes)?;
+                writer.insert(&key, &bytes)?;
             }
         }
 
@@ -542,16 +538,13 @@ impl Store {
         let mut main_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
         self.main_sorter.write_into(&mut main_wtr)?;
 
-        let mut docid_word_positions_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
-        self.docid_word_positions_sorter.write_into(&mut docid_word_positions_wtr)?;
-
         let mut words_pairs_proximities_docids_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
         self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
 
         let main = writer_into_reader(main_wtr)?;
         let word_docids = writer_into_reader(word_docids_wtr)?;
-        let docid_word_positions = writer_into_reader(docid_word_positions_wtr)?;
         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr)?;
+        let docid_word_positions = writer_into_reader(self.docid_word_positions_writer)?;
         let documents = writer_into_reader(self.documents_writer)?;
 
         Ok(Readers {
@@ -602,7 +595,7 @@ fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
 }
 
 fn docid_word_positions_merge(key: &[u8], _values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
-    panic!("merging word docid positions is an error ({:?})", key.as_bstr())
+    panic!("merging docid word positions is an error ({:?})", key.as_bstr())
 }
 
 fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
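Note on why the sorter can be dropped: an MTBL-style writer expects its keys in ascending lexicographic order, which is exactly what the removed Sorter used to guarantee. Here the keys come out ordered by construction: the big-endian document id prefix makes numeric id order coincide with byte order, and iterating the per-document words through a BTreeMap (the added comment in the patch: "We order the words lexicographically, this way we avoid passing by a sorter") yields them sorted. Below is a minimal standalone sketch of that invariant, assuming documents are processed in increasing id order (the patch relies on this but it is not shown above); SortedWriter is a hypothetical stand-in that only mimics the real writer's insert-in-order contract, not its file format.

use std::collections::{BTreeMap, HashMap};

// Hypothetical stand-in for the MTBL `Writer`: like the real one, it only
// accepts keys in strictly increasing lexicographic order.
struct SortedWriter {
    last_key: Option<Vec<u8>>,
    entries: Vec<(Vec<u8>, Vec<u8>)>,
}

impl SortedWriter {
    fn new() -> SortedWriter {
        SortedWriter { last_key: None, entries: Vec::new() }
    }

    fn insert(&mut self, key: &[u8], value: &[u8]) -> Result<(), String> {
        if self.last_key.as_deref().map_or(false, |last| last >= key) {
            return Err(format!("unsorted key: {:?}", key));
        }
        self.last_key = Some(key.to_vec());
        self.entries.push((key.to_vec(), value.to_vec()));
        Ok(())
    }
}

fn main() -> Result<(), String> {
    let mut writer = SortedWriter::new();

    // Documents come in ascending id order; encoding the id in big-endian
    // makes the numeric order and the byte order coincide.
    for (id, words) in [(0u32, vec!["world", "hello"]), (1u32, vec!["abc"])] {
        let mut key = id.to_be_bytes().to_vec();
        let base_size = key.len();

        // The per-document words live in a HashMap; moving them into a
        // BTreeMap yields them in lexicographic order, so the generated
        // `docid ++ word` keys are already sorted and no sorter is needed.
        let words_positions: HashMap<&str, Vec<u32>> =
            words.into_iter().map(|word| (word, vec![0])).collect();
        let words_positions = BTreeMap::from_iter(words_positions);

        for (word, _positions) in words_positions {
            key.truncate(base_size);
            key.extend_from_slice(word.as_bytes());
            writer.insert(&key, &[])?;
        }
    }

    // Every key really was emitted in strictly increasing order.
    assert!(writer.entries.windows(2).all(|w| w[0].0 < w[1].0));
    Ok(())
}

This only sketches the ordering argument; in the patch itself the writer targets a temporary file and is later turned into a reader by writer_into_reader, as in the @@ -542 hunk above.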