diff --git a/raptor-http/src/main.rs b/raptor-http/src/main.rs index 75bbacb72..ba0a136bb 100644 --- a/raptor-http/src/main.rs +++ b/raptor-http/src/main.rs @@ -1,7 +1,10 @@ #[macro_use] extern crate serde_derive; use std::env; -use std::io::Write; +use std::fs::File; +use std::path::Path; +use std::collections::hash_set::HashSet; +use std::io::{self, BufReader, BufRead, Write}; use std::sync::Arc; use std::error::Error; use std::str::from_utf8_unchecked; @@ -20,12 +23,28 @@ struct Document<'a> { description: &'a str, } -fn search(metadata: M, database: D, query: &str) -> Result> +type CommonWords = HashSet; + +fn common_words

(path: P) -> io::Result +where P: AsRef, +{ + let file = File::open(path)?; + let file = BufReader::new(file); + let mut set = HashSet::new(); + for line in file.lines().filter_map(|l| l.ok()) { + for word in line.split_whitespace() { + set.insert(word.to_owned()); + } + } + Ok(set) +} + +fn search(metadata: M, database: D, common_words: &CommonWords, query: &str) -> Result> where M: AsRef, D: AsRef, { let mut automatons = Vec::new(); - for query in query.split_whitespace() { + for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) { let lev = automaton::build(query); automatons.push(lev); } @@ -78,10 +97,12 @@ fn main() { let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap(); let db = Arc::new(db); + let common_words = common_words("fr.stopwords.txt").unwrap(); + let routes = warp::path("search") .and(warp::query()) .map(move |query: SearchQuery| { - let body = search(meta.clone(), db.clone(), &query.query).unwrap(); + let body = search(meta.clone(), db.clone(), &common_words, &query.query).unwrap(); body }) .with(warp::reply::with::header("Content-Type", "application/json")); diff --git a/raptor-indexer/src/main.rs b/raptor-indexer/src/main.rs index ad6304341..6609ab3f3 100644 --- a/raptor-indexer/src/main.rs +++ b/raptor-indexer/src/main.rs @@ -3,9 +3,10 @@ #[macro_use] extern crate serde_derive; +use std::path::Path; use std::collections::{HashSet, BTreeMap}; +use std::io::{self, BufReader, BufRead}; use std::fs::File; -use std::io::{BufReader, BufRead}; use std::iter; use raptor::{MetadataBuilder, DocIndex}; @@ -20,28 +21,31 @@ struct Product { ft: String, } +type CommonWords = HashSet; + +fn common_words

(path: P) -> io::Result +where P: AsRef, +{ + let file = File::open(path)?; + let file = BufReader::new(file); + let mut set = HashSet::new(); + for line in file.lines().filter_map(|l| l.ok()) { + for word in line.split_whitespace() { + set.insert(word.to_owned()); + } + } + Ok(set) +} + fn main() { let data = File::open("products.json_lines").unwrap(); let data = BufReader::new(data); - let common_words = { - match File::open("fr.stopwords.txt") { - Ok(file) => { - let file = BufReader::new(file); - let mut set = HashSet::new(); - for line in file.lines().filter_map(|l| l.ok()) { - for word in line.split_whitespace() { - set.insert(word.to_owned()); - } - } - set - }, - Err(e) => { - eprintln!("{:?}", e); - HashSet::new() - }, - } - }; + let common_path = "fr.stopwords.txt"; + let common_words = common_words(common_path).unwrap_or_else(|e| { + println!("{:?}: {:?}", common_path, e); + HashSet::new() + }); // TODO add a subcommand to pack these files in a tar.xxx archive let random_name = moby_name_gen::random_name(); diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs index 9ab993c60..b04cbcefb 100644 --- a/raptor-search/src/main.rs +++ b/raptor-search/src/main.rs @@ -1,14 +1,33 @@ use std::env; +use std::fs::File; +use std::path::Path; +use std::collections::HashSet; use std::str::from_utf8_unchecked; -use std::io::{self, Write}; +use std::io::{self, BufReader, BufRead, Write}; use elapsed::measure_time; use fst::Streamer; use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; use raptor::{automaton, Metadata, RankedStream}; -fn search(metadata: &Metadata, database: &DB, query: &str) { +type CommonWords = HashSet; + +fn common_words

(path: P) -> io::Result +where P: AsRef, +{ + let file = File::open(path)?; + let file = BufReader::new(file); + let mut set = HashSet::new(); + for line in file.lines().filter_map(|l| l.ok()) { + for word in line.split_whitespace() { + set.insert(word.to_owned()); + } + } + Ok(set) +} + +fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) { let mut automatons = Vec::new(); - for query in query.split_whitespace() { + for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) { let lev = automaton::build(query); automatons.push(lev); } @@ -47,6 +66,12 @@ fn main() { }); println!("{} to load the SST file in RocksDB and reopen it for read-only", elapsed); + let common_path = "fr.stopwords.txt"; + let common_words = common_words(common_path).unwrap_or_else(|e| { + println!("{:?}: {:?}", common_path, e); + HashSet::new() + }); + loop { print!("Searching for: "); io::stdout().flush().unwrap(); @@ -57,7 +82,7 @@ fn main() { if query.is_empty() { break } - let (elapsed, _) = measure_time(|| search(&meta, &db, &query)); + let (elapsed, _) = measure_time(|| search(&meta, &db, &common_words, &query)); println!("Finished in {}", elapsed); } }