From 15208c7d3d03f674ecebf23e5ce192c04a4c0aeb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 22 Sep 2020 10:33:25 +0200
Subject: [PATCH] Simplify the indexer record loop

---
 src/bin/indexer.rs | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index c216c2778..75ce5347d 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -193,21 +193,31 @@ impl Store {
 
     fn write_document(
         &mut self,
-        id: DocumentId,
-        iter: impl IntoIterator,
+        document_id: DocumentId,
+        words_positions: &HashMap,
         record: &StringRecord,
     ) -> anyhow::Result<()>
     {
+        // We store document_id associated with all the words the record contains.
+        for (word, _) in words_positions {
+            self.insert_word_docid(word, document_id)?;
+        }
+
         let record = CsvStringRecordCodec::bytes_encode(record)
-            .with_context(|| format!("could not encode csv record"))?;
-        self.documents_ids.insert(id);
-        self.documents_sorter.insert(id.to_be_bytes(), record)?;
-        Self::write_docid_word_positions(&mut self.sorter, id, iter)?;
+            .with_context(|| format!("could not encode CSV record"))?;
+
+        self.documents_ids.insert(document_id);
+        self.documents_sorter.insert(document_id.to_be_bytes(), record)?;
+        Self::write_docid_word_positions(&mut self.sorter, document_id, words_positions)?;
+
         Ok(())
     }
 
-    fn write_docid_word_positions(sorter: &mut Sorter, id: DocumentId, iter: I) -> anyhow::Result<()>
-    where I: IntoIterator
+    fn write_docid_word_positions(
+        sorter: &mut Sorter,
+        id: DocumentId,
+        words_positions: &HashMap,
+    ) -> anyhow::Result<()>
     {
         // postings positions ids keys are all prefixed
         let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
@@ -216,7 +226,7 @@ impl Store {
         key.extend_from_slice(&id.to_be_bytes());
         let base_size = key.len();
 
-        for (word, positions) in iter {
+        for (word, positions) in words_positions {
             key.truncate(base_size);
             key.extend_from_slice(word.as_bytes());
             // We serialize the positions into a buffer.
@@ -278,8 +288,8 @@ impl Store {
         let mut document_id: usize = 0;
         let mut document = csv::StringRecord::new();
         let mut word_positions = HashMap::new();
-        while rdr.read_record(&mut document)? {
 
+        while rdr.read_record(&mut document)? {
             // We skip documents that must not be indexed by this thread.
             if document_id % num_threads == thread_index {
                 if document_id % ONE_MILLION == 0 {
@@ -293,13 +303,13 @@ impl Store {
                     for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
                         let word = token.to_lowercase();
                         let position = (attr * MAX_POSITION + pos) as u32;
-                        self.insert_word_docid(&word, document_id)?;
                         word_positions.entry(word).or_insert_with(RoaringBitmap::new).insert(position);
                     }
                 }
 
                 // We write the document in the documents store.
-                self.write_document(document_id, word_positions.drain(), &document)?;
+                self.write_document(document_id, &word_positions, &document)?;
+                word_positions.clear();
             }
 
             // Compute the document id of the next document.