From 21aafd603c1f44aac5d44c32281dfe2ea68b2624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 29 Aug 2020 10:56:40 +0200 Subject: [PATCH] Make sure the first document is associated to the document id 0 --- src/bin/indexer.rs | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 9306cc845..708c57b4c 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -434,32 +434,34 @@ fn index_csv( let mut document_id: usize = 0; let mut document = csv::StringRecord::new(); while rdr.read_record(&mut document)? { - document_id = document_id + 1; - // We skip documents that must not be indexed by this thread - if document_id % num_threads != thread_index { continue } - - let document_id = DocumentId::try_from(document_id).context("generated id is too big")?; - if document_id % (ONE_MILLION as u32) == 0 { - debug!("We have seen {}m documents so far ({:.02?}).", - document_id / ONE_MILLION as u32, before.elapsed()); - before = Instant::now(); - } - - for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { - for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) { - let word = word.cow_to_lowercase(); - let position = (attr * MAX_POSITION + pos) as u32; - store.insert_word_position(&word, position)?; - store.insert_word_position_docid(&word, position, document_id)?; + // We skip documents that must not be indexed by this thread. 
+ if document_id % num_threads == thread_index { + if document_id % ONE_MILLION == 0 { + debug!("We have seen {}m documents so far ({:.02?}).", + document_id / ONE_MILLION, before.elapsed()); + before = Instant::now(); } + + let document_id = DocumentId::try_from(document_id).context("generated id is too big")?; + for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { + for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) { + let word = word.cow_to_lowercase(); + let position = (attr * MAX_POSITION + pos) as u32; + store.insert_word_position(&word, position)?; + store.insert_word_position_docid(&word, position, document_id)?; + } + } + + // We write the document in the database. + let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); + writer.write_byte_record(document.as_byte_record())?; + let document = writer.into_inner()?; + store.write_document(document_id, &document)?; } - // We write the document in the database. - let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); - writer.write_byte_record(document.as_byte_record())?; - let document = writer.into_inner()?; - store.write_document(document_id, &document)?; + // Compute the document id of the next document. + document_id = document_id + 1; } let (reader, docs_reader) = store.finish()?;