From a066c084fef64cac6425394454f195cc5535d276 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 27 Sep 2018 17:01:16 +0200
Subject: [PATCH] feat: Use the new Tokenizer in the json-line-indexer

---
 raptor-indexer/src/main.rs | 56 ++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/raptor-indexer/src/main.rs b/raptor-indexer/src/main.rs
index a6da19cbb..78c37450e 100644
--- a/raptor-indexer/src/main.rs
+++ b/raptor-indexer/src/main.rs
@@ -7,9 +7,9 @@ use std::path::Path;
 use std::collections::{HashSet, BTreeMap};
 use std::io::{self, BufReader, BufRead};
 use std::fs::File;
-use std::{env, iter};
+use std::env;
 
-use raptor::{MetadataBuilder, DocIndex};
+use raptor::{MetadataBuilder, DocIndex, Tokenizer};
 use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
 use serde_json::from_str;
 use unidecode::unidecode;
@@ -37,6 +37,30 @@ where P: AsRef<Path>,
     Ok(set)
 }
 
+fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_index: u64, attr: u8, words: I)
+where A: io::Write,
+      B: io::Write,
+      I: IntoIterator<Item=(usize, &'a str)>,
+{
+    for (index, word) in words {
+        let doc_index = DocIndex {
+            document: doc_index,
+            attribute: attr,
+            attribute_index: index as u32,
+        };
+        // insert the exact representation
+        let word_lower = word.to_lowercase();
+
+        // and the unidecoded lowercased version
+        let word_unidecoded = unidecode(word).to_lowercase();
+        if word_lower != word_unidecoded {
+            builder.insert(word_unidecoded, doc_index);
+        }
+
+        builder.insert(word_lower, doc_index);
+    }
+}
+
 fn main() {
     let data_path = env::args().nth(1).expect("Missing data json lines file (e.g. products.json_lines)");
     let data = File::open(data_path).unwrap();
@@ -69,29 +93,13 @@ fn main() {
 
         let product: Product = from_str(&line).unwrap();
 
-        {
-            let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
-            let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
+        let title = Tokenizer::new(&product.title);
+        let title = title.iter().filter(|&(_, w)| !common_words.contains(w));
+        insert_document_words(&mut builder, product.group_id, 0, title);
 
-            let words = title.chain(description);
-            for (i, (attr, word)) in words {
-                let doc_index = DocIndex {
-                    document: product.group_id,
-                    attribute: attr,
-                    attribute_index: i as u32,
-                };
-                // insert the exact representation
-                let word_lower = word.to_lowercase();
-
-                // and the unidecoded lowercased version
-                let word_unidecoded = unidecode(word).to_lowercase();
-                if word_lower != word_unidecoded {
-                    builder.insert(word_unidecoded, doc_index);
-                }
-
-                builder.insert(word_lower, doc_index);
-            }
-        }
+        let description = Tokenizer::new(&product.ft);
+        let description = description.iter().filter(|&(_, w)| !common_words.contains(w));
+        insert_document_words(&mut builder, product.group_id, 1, description);
 
         // TODO simplify this by using functions and
         //      use the MetadataBuilder internal BTreeMap ?
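
For reference, below is a minimal, self-contained sketch of the flow this patch factors into insert_document_words. The ToyBuilder type and the plain whitespace split are hypothetical stand-ins for raptor's MetadataBuilder and Tokenizer (assumed here, as in the patch, to yield (index, word) pairs); the real code additionally inserts the unidecoded lowercased variant of each word.

use std::collections::BTreeMap;

// Stand-in for raptor's `DocIndex`: one word occurrence in one
// attribute of one document.
#[derive(Debug, Clone, Copy)]
struct DocIndex {
    document: u64,
    attribute: u8,
    attribute_index: u32,
}

// Toy in-memory builder standing in for `MetadataBuilder`: it only
// maps each indexed word to the places where it occurs.
#[derive(Default)]
struct ToyBuilder {
    postings: BTreeMap<String, Vec<DocIndex>>,
}

impl ToyBuilder {
    fn insert(&mut self, word: String, index: DocIndex) {
        self.postings.entry(word).or_default().push(index);
    }
}

// Mirrors the patch's `insert_document_words`: consume `(position, word)`
// pairs and record the lowercased form under the given attribute.
fn insert_document_words<'a, I>(builder: &mut ToyBuilder, document: u64, attribute: u8, words: I)
where I: IntoIterator<Item = (usize, &'a str)>,
{
    for (position, word) in words {
        let doc_index = DocIndex {
            document,
            attribute,
            attribute_index: position as u32,
        };
        builder.insert(word.to_lowercase(), doc_index);
    }
}

fn main() {
    // A plain whitespace split stands in for `Tokenizer::new(..).iter()`.
    let title = "Raptor Search Engine";
    let words = title.split_whitespace().enumerate();

    let mut builder = ToyBuilder::default();
    insert_document_words(&mut builder, 42, 0, words);

    for (word, indexes) in &builder.postings {
        println!("{}: {:?}", word, indexes);
    }
}

Passing the attribute id explicitly is what lets the title and description share one code path, replacing the previous iter::repeat(attr).zip(..) trick from the deleted block.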