From 8a0c82d51e9aefd348cf13433f7a5bd7ba66b557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 27 Sep 2018 16:59:41 +0200 Subject: [PATCH] feat: Use the new Tokenizer in the csv-indexer --- raptor-indexer-csv/src/main.rs | 55 +++++++++++++++++++--------------- raptor/src/lib.rs | 1 + 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/raptor-indexer-csv/src/main.rs b/raptor-indexer-csv/src/main.rs index b26dfcd9f..763a4c962 100644 --- a/raptor-indexer-csv/src/main.rs +++ b/raptor-indexer-csv/src/main.rs @@ -7,11 +7,10 @@ use std::path::{Path, PathBuf}; use std::collections::{HashSet, BTreeMap}; use std::io::{self, BufReader, BufRead}; use std::fs::File; -use std::iter; use csv::ReaderBuilder; use structopt::StructOpt; -use raptor::{MetadataBuilder, DocIndex}; +use raptor::{MetadataBuilder, DocIndex, Tokenizer}; use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions}; use unidecode::unidecode; @@ -55,6 +54,30 @@ where P: AsRef, Ok(set) } +fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder, doc_index: u64, attr: u8, words: I) +where A: io::Write, + B: io::Write, + I: IntoIterator, +{ + for (index, word) in words { + let doc_index = DocIndex { + document: doc_index, + attribute: attr, + attribute_index: index as u32, + }; + // insert the exact representation + let word_lower = word.to_lowercase(); + + // and the unidecoded lowercased version + let word_unidecoded = unidecode(word).to_lowercase(); + if word_lower != word_unidecoded { + builder.insert(word_unidecoded, doc_index); + } + + builder.insert(word_lower, doc_index); + } +} + fn main() { let opt = Opt::from_args(); @@ -85,29 +108,13 @@ fn main() { Err(e) => { eprintln!("{:?}", e); errors += 1; continue }, }; - { - let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); - let description = iter::repeat(1).zip(product.description.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); + let title = Tokenizer::new(&product.title); + let title = title.iter().filter(|&(_, w)| !common_words.contains(w)); + insert_document_words(&mut builder, product.id, 0, title); - let words = title.chain(description); - for (i, (attr, word)) in words { - let doc_index = DocIndex { - document: product.id, - attribute: attr, - attribute_index: i as u32, - }; - // insert the exact representation - let word_lower = word.to_lowercase(); - - // and the unidecoded lowercased version - let word_unidecoded = unidecode(word).to_lowercase(); - if word_lower != word_unidecoded { - builder.insert(word_unidecoded, doc_index); - } - - builder.insert(word_lower, doc_index); - } - } + let description = Tokenizer::new(&product.description); + let description = description.iter().filter(|&(_, w)| !common_words.contains(w)); + insert_document_words(&mut builder, product.id, 1, description); // TODO simplify this by using functions and // use the MetadataBuilder internal BTreeMap ? diff --git a/raptor/src/lib.rs b/raptor/src/lib.rs index 999218169..122c1f69e 100644 --- a/raptor/src/lib.rs +++ b/raptor/src/lib.rs @@ -8,6 +8,7 @@ pub mod tokenizer; pub use self::metadata::{Metadata, MetadataBuilder}; pub use self::rank::RankedStream; +pub use self::tokenizer::Tokenizer; pub type DocumentId = u64;