From 32f8908d711794346d4d86a524a72012d6302b09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 6 Jan 2019 18:03:47 +0100
Subject: [PATCH] feat: Reintroduce stopwords for the serializer

---
 examples/create-database.rs              |  93 ++++++++++---
 misc/en.stopwords.txt                    |   3 +-
 misc/fr.stopwords.txt                    | 163 +++++++++++++++++++++++
 src/database/mod.rs                      |  36 +++--
 src/database/serde/indexer_serializer.rs |   5 +
 src/database/serde/serializer.rs         |   8 ++
 src/database/update/builder.rs           |   3 +
 7 files changed, 276 insertions(+), 35 deletions(-)
 create mode 100644 misc/fr.stopwords.txt

diff --git a/examples/create-database.rs b/examples/create-database.rs
index ac86d2621..66b488c95 100644
--- a/examples/create-database.rs
+++ b/examples/create-database.rs
@@ -1,12 +1,13 @@
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
+use std::io::{self, BufRead, BufReader};
 use std::path::{Path, PathBuf};
 use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;
 
-use hashbrown::HashMap;
+use hashbrown::{HashMap, HashSet};
 use serde_derive::{Serialize, Deserialize};
 use structopt::StructOpt;
 
@@ -26,6 +27,13 @@ pub struct Opt {
     /// The path to the schema.
     #[structopt(long = "schema", parse(from_os_str))]
     pub schema_path: PathBuf,
+
+    /// The path to the list of stop words (one per line).
+    #[structopt(long = "stop-words", parse(from_os_str))]
+    pub stop_words_path: Option<PathBuf>,
+
+    #[structopt(long = "update-group-size")]
+    pub update_group_size: Option<usize>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -34,37 +42,75 @@ struct Document<'a> (
     HashMap<String, Cow<'a, str>>
 );
 
-fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
+fn index(
+    schema: Schema,
+    database_path: &Path,
+    csv_data_path: &Path,
+    update_group_size: Option<usize>,
+    stop_words: &HashSet<String>,
+) -> Result<Database, Box<Error>>
+{
     let database = Database::create(database_path, &schema)?;
 
-    println!("start indexing...");
-
-    let tokenizer_builder = DefaultBuilder::new();
-    let update_path = tempfile::NamedTempFile::new()?;
-    let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
-
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
     let headers = rdr.headers()?.clone();
 
-    while rdr.read_record(&mut raw_record)? {
-        let document: Document = match raw_record.deserialize(Some(&headers)) {
-            Ok(document) => document,
-            Err(e) => {
-                eprintln!("{:?}", e);
-                continue;
-            }
-        };
+    let mut i = 0;
+    let mut end_of_file = false;
 
-        update.update_document(&document, &tokenizer_builder)?;
+    while !end_of_file {
+        let tokenizer_builder = DefaultBuilder::new();
+        let update_path = tempfile::NamedTempFile::new()?;
+        let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
+
+        loop {
+            end_of_file = !rdr.read_record(&mut raw_record)?;
+            if end_of_file { break }
+
+            let document: Document = match raw_record.deserialize(Some(&headers)) {
+                Ok(document) => document,
+                Err(e) => {
+                    eprintln!("{:?}", e);
+                    continue;
+                }
+            };
+
+            update.update_document(&document, &tokenizer_builder, &stop_words)?;
+
+            print!("\rindexing document {}", i);
+            i += 1;
+
+            if let Some(group_size) = update_group_size {
+                if i % group_size == 0 { break }
+            }
+        }
+
+        println!();
+
+        println!("building update...");
+        let update = update.build()?;
+        println!("ingesting update...");
+        database.ingest_update_file(update)?;
     }
 
-    let update = update.build()?;
-    database.ingest_update_file(update)?;
-
     Ok(database)
 }
 
+fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
+    let f = File::open(path)?;
+    let reader = BufReader::new(f);
+    let mut words = HashSet::new();
+
+    for line in reader.lines() {
+        let line = line?;
+        let word = line.trim().to_string();
+        words.insert(word);
+    }
+
+    Ok(words)
+}
+
 fn main() -> Result<(), Box<Error>> {
     let _ = env_logger::init();
     let opt = Opt::from_args();
@@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
         Schema::from_toml(file)?
     };
 
+    let stop_words = match opt.stop_words_path {
+        Some(ref path) => retrieve_stop_words(path)?,
+        None => HashSet::new(),
+    };
+
     let (elapsed, result) = elapsed::measure_time(|| {
-        index(schema, &opt.database_path, &opt.csv_data_path)
+        index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
     });
 
     if let Err(e) = result {
diff --git a/misc/en.stopwords.txt b/misc/en.stopwords.txt
index c32f9f9b3..c691414d2 100644
--- a/misc/en.stopwords.txt
+++ b/misc/en.stopwords.txt
@@ -95,7 +95,8 @@ or
 other
 ought
 our
-ours ourselves
+ours
+ourselves
 out
 over
 own
diff --git a/misc/fr.stopwords.txt b/misc/fr.stopwords.txt
new file mode 100644
index 000000000..0bbf738f7
--- /dev/null
+++ b/misc/fr.stopwords.txt
@@ -0,0 +1,163 @@
+au
+aux
+avec
+ce
+ces
+dans
+de
+des
+du
+elle
+en
+et
+eux
+il
+je
+la
+le
+leur
+lui
+ma
+mais
+me
+même
+mes
+moi
+mon
+ne
+nos
+notre
+nous
+on
+ou
+par
+pas
+pour
+qu
+que
+qui
+sa
+se
+ses
+son
+sur
+ta
+te
+tes
+toi
+ton
+tu
+un
+une
+vos
+votre
+vous
+c
+d
+j
+l
+à
+m
+n
+s
+t
+y
+été
+étée
+étées
+étés
+étant
+suis
+es
+est
+sommes
+êtes
+sont
+serai
+seras
+sera
+serons
+serez
+seront
+serais
+serait
+serions
+seriez
+seraient
+étais
+était
+étions
+étiez
+étaient
+fus
+fut
+fûmes
+fûtes
+furent
+sois
+soit
+soyons
+soyez
+soient
+fusse
+fusses
+fût
+fussions
+fussiez
+fussent
+ayant
+eu
+eue
+eues
+eus
+ai
+as
+avons
+avez
+ont
+aurai
+auras
+aura
+aurons
+aurez
+auront
+aurais
+aurait
+aurions
+auriez
+auraient
+avais
+avait
+avions
+aviez
+avaient
+eut
+eûmes
+eûtes
+eurent
+aie
+aies
+ait
+ayons
+ayez
+aient
+eusse
+eusses
+eût
+eussions
+eussiez
+eussent
+ceci
+celà
+cet
+cette
+ici
+ils
+les
+leurs
+quel
+quels
+quelle
+quelles
+sans
+soi
\ No newline at end of file
diff --git a/src/database/mod.rs b/src/database/mod.rs
index 6e03dd44d..bc86eede6 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -185,6 +185,7 @@ mod tests {
     use std::error::Error;
 
     use serde_derive::{Serialize, Deserialize};
+    use hashbrown::HashSet;
     use tempfile::tempdir;
 
     use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
@@ -194,6 +195,7 @@ mod tests {
     #[test]
     fn ingest_one_update_file() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
+        let stop_words = HashSet::new();
 
         let rocksdb_path = dir.path().join("rocksdb.rdb");
 
@@ -237,8 +239,8 @@ mod tests {
             let tokenizer_builder = DefaultBuilder::new();
             let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
 
             builder.build()?
         };
@@ -258,6 +260,7 @@ mod tests {
     #[test]
     fn ingest_two_update_files() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
+        let stop_words = HashSet::new();
 
         let rocksdb_path = dir.path().join("rocksdb.rdb");
 
@@ -312,8 +315,8 @@ mod tests {
             let update_path = dir.path().join("update-000.sst");
             let mut builder = UpdateBuilder::new(update_path, schema.clone());
 
-            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
-            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+            docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
 
             builder.build()?
         };
@@ -325,8 +328,8 @@ mod tests {
             let update_path = dir.path().join("update-001.sst");
             let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
-            docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
+            docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
+            docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
 
             builder.build()?
         };
@@ -364,8 +367,9 @@ mod bench {
     use rand::distributions::Alphanumeric;
     use rand_xorshift::XorShiftRng;
     use rand::{Rng, SeedableRng};
-    use rand::seq::SliceRandom;
     use serde_derive::Serialize;
+    use rand::seq::SliceRandom;
+    use hashbrown::HashSet;
 
     use crate::tokenizer::DefaultBuilder;
     use crate::database::update::UpdateBuilder;
@@ -394,6 +398,7 @@ mod bench {
     #[bench]
     fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -421,7 +426,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -440,6 +445,7 @@ mod bench {
     #[bench]
     fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -467,7 +473,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -487,6 +493,7 @@ mod bench {
     #[ignore]
     fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -514,7 +521,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -533,6 +540,7 @@ mod bench {
     #[bench]
     fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -560,7 +568,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -579,6 +587,7 @@ mod bench {
     #[bench]
     fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -606,7 +615,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
@@ -626,6 +635,7 @@ mod bench {
     #[ignore]
     fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
         let dir = tempfile::tempdir()?;
+        let stop_words = HashSet::new();
 
         let mut builder = SchemaBuilder::with_identifier("id");
         builder.new_attribute("title", STORED | INDEXED);
@@ -653,7 +663,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder)?;
+            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
         }
 
         let update = builder.build()?;
diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs
index 1cea2da67..7bbcca7e7 100644
--- a/src/database/serde/indexer_serializer.rs
+++ b/src/database/serde/indexer_serializer.rs
@@ -5,6 +5,7 @@ use crate::tokenizer::TokenizerBuilder;
 use crate::tokenizer::Token;
 use crate::{DocumentId, DocIndex, Attribute, WordArea};
 
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
 
@@ -13,6 +14,7 @@ pub struct IndexerSerializer<'a, B> {
     pub update: &'a mut DocumentUpdate,
     pub document_id: DocumentId,
     pub attribute: SchemaAttr,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
@@ -48,6 +50,7 @@ where B: TokenizerBuilder
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
         for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
+            // FIXME must use u32::try_from instead
             let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
                 Ok(attribute) => attribute,
@@ -69,6 +72,8 @@ where B: TokenizerBuilder
 
             // insert the exact representation
             let word_lower = word.to_lowercase();
+            if self.stop_words.contains(&word_lower) { continue }
+
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {
diff --git a/src/database/serde/serializer.rs b/src/database/serde/serializer.rs
index 074aba23c..0019f4497 100644
--- a/src/database/serde/serializer.rs
+++ b/src/database/serde/serializer.rs
@@ -1,3 +1,4 @@
+use hashbrown::HashSet;
 use serde::Serialize;
 use serde::ser;
 
@@ -14,6 +15,7 @@ pub struct Serializer<'a, B> {
     pub update: &'a mut DocumentUpdate,
     pub document_id: DocumentId,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -139,6 +141,7 @@ where B: TokenizerBuilder
             document_id: self.document_id,
             update: self.update,
             tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
             current_key_name: None,
         })
     }
@@ -154,6 +157,7 @@ where B: TokenizerBuilder
             update: self.update,
             document_id: self.document_id,
             tokenizer_builder: self.tokenizer_builder,
+            stop_words: self.stop_words,
         })
     }
 
@@ -174,6 +178,7 @@ pub struct MapSerializer<'a, B> {
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
     pub current_key_name: Option<String>,
 }
 
@@ -219,6 +224,7 @@ where B: TokenizerBuilder
                     tokenizer_builder: self.tokenizer_builder,
                     document_id: self.document_id,
                     attribute: attr,
+                    stop_words: self.stop_words,
                 };
                 value.serialize(serializer)?;
             }
@@ -237,6 +243,7 @@ pub struct StructSerializer<'a, B> {
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate,
     pub tokenizer_builder: &'a B,
+    pub stop_words: &'a HashSet<String>,
 }
 
 impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@@ -264,6 +271,7 @@ where B: TokenizerBuilder
                 tokenizer_builder: self.tokenizer_builder,
                 document_id: self.document_id,
                 attribute: attr,
+                stop_words: self.stop_words,
             };
             value.serialize(serializer)?;
         }
diff --git a/src/database/update/builder.rs b/src/database/update/builder.rs
index 16a805a67..bd7f8a567 100644
--- a/src/database/update/builder.rs
+++ b/src/database/update/builder.rs
@@ -1,6 +1,7 @@
 use std::path::PathBuf;
 use std::error::Error;
 
+use hashbrown::HashSet;
 use serde::Serialize;
 
 use crate::database::serde::serializer::Serializer;
@@ -28,6 +29,7 @@ impl UpdateBuilder {
         &mut self,
         document: T,
         tokenizer_builder: &B,
+        stop_words: &HashSet<String>,
     ) -> Result<DocumentId, Box<Error>>
     where T: Serialize,
           B: TokenizerBuilder,
@@ -40,6 +42,7 @@ impl UpdateBuilder {
             document_id: document_id,
             tokenizer_builder: tokenizer_builder,
             update: update,
+            stop_words: stop_words,
         };
 
         document.serialize(serializer)?;
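
A minimal sketch (not part of the patch) of how the three-argument `update_document` introduced above is driven from caller code. It only uses calls that appear in this diff; `index_one` is a hypothetical helper name, and the `schema`, `database` and `document` values are assumed to be built exactly as examples/create-database.rs already builds them, with the same imports in scope.

// Sketch only: one update cycle with stop words, mirroring the loop body of
// `index` in examples/create-database.rs above.
fn index_one(
    database: &Database,
    schema: Schema,
    document: &Document<'_>,
    stop_words: &HashSet<String>,
) -> Result<(), Box<Error>> {
    let tokenizer_builder = DefaultBuilder::new();
    let update_path = tempfile::NamedTempFile::new()?;
    let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);

    // tokens whose lowercased form appears in `stop_words` are skipped by the
    // indexer serializer, so they never reach the index
    update.update_document(document, &tokenizer_builder, stop_words)?;

    let update = update.build()?;
    database.ingest_update_file(update)?;
    Ok(())
}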