From c56c35b45b0080a055c1c2d6f4e478944eeae45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Oct 2018 16:57:21 +0200 Subject: [PATCH] feat: Introduce the Criteria struct --- examples/serve-console.rs | 16 +- examples/serve-http.rs | 19 ++- src/lib.rs | 1 - src/rank/{ => criterion}/exact.rs | 0 src/rank/criterion/mod.rs | 71 ++++++++ src/rank/{ => criterion}/number_of_words.rs | 0 src/rank/{ => criterion}/sum_of_typos.rs | 0 .../{ => criterion}/sum_of_words_attribute.rs | 0 .../{ => criterion}/sum_of_words_position.rs | 0 src/rank/{ => criterion}/words_proximity.rs | 0 src/rank/mod.rs | 154 +----------------- src/rank/ranked_stream.rs | 141 ++++++++++++++++ 12 files changed, 243 insertions(+), 159 deletions(-) rename src/rank/{ => criterion}/exact.rs (100%) create mode 100644 src/rank/criterion/mod.rs rename src/rank/{ => criterion}/number_of_words.rs (100%) rename src/rank/{ => criterion}/sum_of_typos.rs (100%) rename src/rank/{ => criterion}/sum_of_words_attribute.rs (100%) rename src/rank/{ => criterion}/sum_of_words_position.rs (100%) rename src/rank/{ => criterion}/words_proximity.rs (100%) create mode 100644 src/rank/ranked_stream.rs diff --git a/examples/serve-console.rs b/examples/serve-console.rs index 5ef119013..21def0676 100644 --- a/examples/serve-console.rs +++ b/examples/serve-console.rs @@ -6,7 +6,8 @@ use std::path::PathBuf; use fst::Streamer; use elapsed::measure_time; use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; -use raptor::{automaton, Metadata, RankedStream, CommonWords}; +use raptor::{automaton, Metadata, CommonWords}; +use raptor::rank; #[derive(Debug, StructOpt)] pub struct CommandConsole { @@ -69,14 +70,21 @@ fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: automatons.push(lev); } - let mut stream = RankedStream::new(&metadata, automatons, 20); + let config = rank::Config { + criteria: rank::criterion::default(), + metadata: &metadata, + automatons: automatons, + limit: 20, + }; + + let mut stream = rank::RankedStream::new(config); while let Some(document) = stream.next() { - let id_key = format!("{}-id", document.document_id); + let id_key = format!("{}-id", document.id); let id = database.get(id_key.as_bytes()).unwrap().unwrap(); let id = unsafe { from_utf8_unchecked(&id) }; print!("{} ", id); - let title_key = format!("{}-title", document.document_id); + let title_key = format!("{}-title", document.id); let title = database.get(title_key.as_bytes()).unwrap().unwrap(); let title = unsafe { from_utf8_unchecked(&title) }; println!("{:?}", title); diff --git a/examples/serve-http.rs b/examples/serve-http.rs index 7bad1dd99..4581f512b 100644 --- a/examples/serve-http.rs +++ b/examples/serve-http.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use std::error::Error; use std::sync::Arc; -use raptor::rank::RankedStream; +use raptor::rank; use raptor::{automaton, Metadata, CommonWords}; use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; use fst::Streamer; @@ -100,26 +100,33 @@ where M: AsRef, automatons.push(lev); } - let mut stream = RankedStream::new(metadata.as_ref(), automatons, 20); + let config = rank::Config { + criteria: rank::criterion::default(), + metadata: metadata.as_ref(), + automatons: automatons, + limit: 20, + }; + + let mut stream = rank::RankedStream::new(config); let mut body = Vec::new(); write!(&mut body, "[")?; let mut first = true; while let Some(document) = stream.next() { - let title_key = format!("{}-title", document.document_id); + let title_key = format!("{}-title", document.id); let title = database.as_ref().get(title_key.as_bytes()).unwrap().unwrap(); let title = unsafe { from_utf8_unchecked(&title) }; - let description_key = format!("{}-description", document.document_id); + let description_key = format!("{}-description", document.id); let description = database.as_ref().get(description_key.as_bytes()).unwrap().unwrap(); let description = unsafe { from_utf8_unchecked(&description) }; - let image_key = format!("{}-image", document.document_id); + let image_key = format!("{}-image", document.id); let image = database.as_ref().get(image_key.as_bytes()).unwrap().unwrap(); let image = unsafe { from_utf8_unchecked(&image) }; let document = Document { - id: document.document_id, + id: document.id, title: title, description: description, image: image, diff --git a/src/lib.rs b/src/lib.rs index 84b325077..ab4597301 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,6 @@ pub mod tokenizer; mod common_words; pub use self::metadata::{Metadata, MetadataBuilder}; -pub use self::rank::RankedStream; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; diff --git a/src/rank/exact.rs b/src/rank/criterion/exact.rs similarity index 100% rename from src/rank/exact.rs rename to src/rank/criterion/exact.rs diff --git a/src/rank/criterion/mod.rs b/src/rank/criterion/mod.rs new file mode 100644 index 000000000..31b188d63 --- /dev/null +++ b/src/rank/criterion/mod.rs @@ -0,0 +1,71 @@ +mod sum_of_typos; +mod number_of_words; +mod words_proximity; +mod sum_of_words_attribute; +mod sum_of_words_position; +mod exact; + +use std::vec; +use std::cmp::Ordering; +use crate::rank::Document; + +pub use self::{ + sum_of_typos::sum_of_typos, + number_of_words::number_of_words, + words_proximity::words_proximity, + sum_of_words_attribute::sum_of_words_attribute, + sum_of_words_position::sum_of_words_position, + exact::exact, +}; + +#[inline] +pub fn document_id(lhs: &Document, rhs: &Document) -> Ordering { + lhs.id.cmp(&rhs.id) +} + +#[derive(Debug)] +pub struct Criteria(Vec); + +impl Criteria { + pub fn new() -> Self { + Criteria(Vec::new()) + } + + pub fn with_capacity(cap: usize) -> Self { + Criteria(Vec::with_capacity(cap)) + } + + pub fn push(&mut self, criterion: F) { + self.0.push(criterion) + } + + pub fn add(mut self, criterion: F) -> Self { + self.push(criterion); + self + } +} + +impl IntoIterator for Criteria { + type Item = F; + type IntoIter = vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +pub fn default() -> Criteria Ordering + Copy> { + let functions = &[ + sum_of_typos, + number_of_words, + words_proximity, + sum_of_words_attribute, + sum_of_words_position, + exact, + document_id, + ]; + + let mut criteria = Criteria::with_capacity(functions.len()); + for f in functions { criteria.push(f) } + criteria +} diff --git a/src/rank/number_of_words.rs b/src/rank/criterion/number_of_words.rs similarity index 100% rename from src/rank/number_of_words.rs rename to src/rank/criterion/number_of_words.rs diff --git a/src/rank/sum_of_typos.rs b/src/rank/criterion/sum_of_typos.rs similarity index 100% rename from src/rank/sum_of_typos.rs rename to src/rank/criterion/sum_of_typos.rs diff --git a/src/rank/sum_of_words_attribute.rs b/src/rank/criterion/sum_of_words_attribute.rs similarity index 100% rename from src/rank/sum_of_words_attribute.rs rename to src/rank/criterion/sum_of_words_attribute.rs diff --git a/src/rank/sum_of_words_position.rs b/src/rank/criterion/sum_of_words_position.rs similarity index 100% rename from src/rank/sum_of_words_position.rs rename to src/rank/criterion/sum_of_words_position.rs diff --git a/src/rank/words_proximity.rs b/src/rank/criterion/words_proximity.rs similarity index 100% rename from src/rank/words_proximity.rs rename to src/rank/criterion/words_proximity.rs diff --git a/src/rank/mod.rs b/src/rank/mod.rs index e7a50003d..0fe544ae1 100644 --- a/src/rank/mod.rs +++ b/src/rank/mod.rs @@ -1,29 +1,9 @@ -mod sum_of_typos; -mod number_of_words; -mod words_proximity; -mod sum_of_words_attribute; -mod sum_of_words_position; -mod exact; +pub mod criterion; +mod ranked_stream; -use std::cmp::Ordering; -use std::rc::Rc; -use std::{mem, vec}; -use fst::Streamer; -use fnv::FnvHashMap; -use group_by::GroupByMut; -use crate::automaton::{DfaExt, AutomatonExt}; -use crate::metadata::Metadata; -use crate::metadata::ops::{OpBuilder, Union}; use crate::{Match, DocumentId}; -use self::{ - sum_of_typos::sum_of_typos, - number_of_words::number_of_words, - words_proximity::words_proximity, - sum_of_words_attribute::sum_of_words_attribute, - sum_of_words_position::sum_of_words_position, - exact::exact, -}; +pub use self::ranked_stream::{RankedStream, Config}; #[inline] fn match_query_index(a: &Match, b: &Match) -> bool { @@ -32,7 +12,7 @@ fn match_query_index(a: &Match, b: &Match) -> bool { #[derive(Debug, Clone)] pub struct Document { - pub document_id: DocumentId, + pub id: DocumentId, pub matches: Vec, } @@ -41,129 +21,7 @@ impl Document { Self::from_sorted_matches(doc, vec![match_]) } - pub fn from_sorted_matches(doc: DocumentId, matches: Vec) -> Self { - Self { - document_id: doc, - matches: matches, - } - } -} - -fn matches_into_iter(matches: FnvHashMap>, limit: usize) -> vec::IntoIter { - let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| { - matches.sort_unstable(); - Document::from_sorted_matches(id, matches) - }).collect(); - - let sorts = &[ - sum_of_typos, - number_of_words, - words_proximity, - sum_of_words_attribute, - sum_of_words_position, - exact, - ]; - - let mut groups = vec![documents.as_mut_slice()]; - - for sort in sorts { - let temp = mem::replace(&mut groups, Vec::new()); - let mut computed = 0; - - 'grp: for group in temp { - group.sort_unstable_by(sort); - for group in GroupByMut::new(group, |a, b| sort(a, b) == Ordering::Equal) { - computed += group.len(); - groups.push(group); - if computed >= limit { break 'grp } - } - } - } - - documents.truncate(limit); - documents.into_iter() -} - -pub struct RankedStream<'m>(RankedStreamInner<'m>); - -impl<'m> RankedStream<'m> { - pub fn new(metadata: &'m Metadata, automatons: Vec, limit: usize) -> Self { - let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect(); - let mut builder = OpBuilder::with_automatons(automatons.clone()); - builder.push(metadata); - - let inner = RankedStreamInner::Fed { - inner: builder.union(), - automatons: automatons, - limit: limit, - matches: FnvHashMap::default(), - }; - - RankedStream(inner) - } -} - -impl<'m, 'a> fst::Streamer<'a> for RankedStream<'m> { - type Item = Document; - - fn next(&'a mut self) -> Option { - self.0.next() - } -} - -enum RankedStreamInner<'m> { - Fed { - inner: Union<'m>, - automatons: Vec>, - limit: usize, - matches: FnvHashMap>, - }, - Pours { - inner: vec::IntoIter, - }, -} - -impl<'m, 'a> fst::Streamer<'a> for RankedStreamInner<'m> { - type Item = Document; - - fn next(&'a mut self) -> Option { - loop { - match self { - RankedStreamInner::Fed { inner, automatons, limit, matches } => { - match inner.next() { - Some((string, indexed_values)) => { - for iv in indexed_values { - - let automaton = &automatons[iv.index]; - let distance = automaton.eval(string).to_u8(); - let same_length = string.len() == automaton.query_len(); - - for di in iv.doc_indexes.as_slice() { - let match_ = Match { - query_index: iv.index as u32, - distance: distance, - attribute: di.attribute, - attribute_index: di.attribute_index, - is_exact: distance == 0 && same_length, - }; - matches.entry(di.document) - .or_insert_with(Vec::new) - .push(match_); - } - } - }, - None => { - let matches = mem::replace(matches, FnvHashMap::default()); - *self = RankedStreamInner::Pours { - inner: matches_into_iter(matches, *limit).into_iter() - }; - }, - } - }, - RankedStreamInner::Pours { inner } => { - return inner.next() - }, - } - } + pub fn from_sorted_matches(id: DocumentId, matches: Vec) -> Self { + Self { id, matches } } } diff --git a/src/rank/ranked_stream.rs b/src/rank/ranked_stream.rs new file mode 100644 index 000000000..e395eb4a9 --- /dev/null +++ b/src/rank/ranked_stream.rs @@ -0,0 +1,141 @@ +use std::cmp::Ordering; +use std::rc::Rc; +use std::{mem, vec}; + +use fnv::FnvHashMap; +use fst::Streamer; +use group_by::GroupByMut; + +use crate::automaton::{DfaExt, AutomatonExt}; +use crate::metadata::Metadata; +use crate::metadata::ops::{OpBuilder, Union}; +use crate::rank::criterion::Criteria; +use crate::rank::Document; +use crate::{Match, DocumentId}; + +pub struct Config<'m, F> { + pub criteria: Criteria, + pub metadata: &'m Metadata, + pub automatons: Vec, + pub limit: usize, +} + +pub struct RankedStream<'m, F>(RankedStreamInner<'m, F>); + +impl<'m, F> RankedStream<'m, F> { + pub fn new(config: Config<'m, F>) -> Self { + let automatons: Vec<_> = config.automatons.into_iter().map(Rc::new).collect(); + let mut builder = OpBuilder::with_automatons(automatons.clone()); + builder.push(config.metadata); + + let inner = RankedStreamInner::Fed { + inner: builder.union(), + automatons: automatons, + criteria: config.criteria, + limit: config.limit, + matches: FnvHashMap::default(), + }; + + RankedStream(inner) + } +} + +impl<'m, 'a, F> fst::Streamer<'a> for RankedStream<'m, F> +where F: Fn(&Document, &Document) -> Ordering + Copy, +{ + type Item = Document; + + fn next(&'a mut self) -> Option { + self.0.next() + } +} + +enum RankedStreamInner<'m, F> { + Fed { + inner: Union<'m>, + automatons: Vec>, + criteria: Criteria, + limit: usize, + matches: FnvHashMap>, + }, + Pours { + inner: vec::IntoIter, + }, +} + +impl<'m, 'a, F> fst::Streamer<'a> for RankedStreamInner<'m, F> +where F: Fn(&Document, &Document) -> Ordering + Copy, +{ + type Item = Document; + + fn next(&'a mut self) -> Option { + loop { + match self { + RankedStreamInner::Fed { inner, automatons, criteria, limit, matches } => { + match inner.next() { + Some((string, indexed_values)) => { + for iv in indexed_values { + let automaton = &automatons[iv.index]; + let distance = automaton.eval(string).to_u8(); + let same_length = string.len() == automaton.query_len(); + + for di in iv.doc_indexes.as_slice() { + let match_ = Match { + query_index: iv.index as u32, + distance: distance, + attribute: di.attribute, + attribute_index: di.attribute_index, + is_exact: distance == 0 && same_length, + }; + matches.entry(di.document) + .or_insert_with(Vec::new) + .push(match_); + } + } + }, + None => { + let matches = mem::replace(matches, FnvHashMap::default()); + let criteria = mem::replace(criteria, Criteria::new()); + *self = RankedStreamInner::Pours { + inner: matches_into_iter(matches, criteria, *limit).into_iter() + }; + }, + } + }, + RankedStreamInner::Pours { inner } => { + return inner.next() + }, + } + } + } +} + +fn matches_into_iter(matches: FnvHashMap>, + criteria: Criteria, + limit: usize) -> vec::IntoIter +where F: Fn(&Document, &Document) -> Ordering + Copy, +{ + let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| { + matches.sort_unstable(); + Document::from_sorted_matches(id, matches) + }).collect(); + + let mut groups = vec![documents.as_mut_slice()]; + + for sort in criteria { + let temp = mem::replace(&mut groups, Vec::new()); + let mut computed = 0; + + 'grp: for group in temp { + group.sort_unstable_by(sort); + for group in GroupByMut::new(group, |a, b| sort(a, b) == Ordering::Equal) { + computed += group.len(); + groups.push(group); + if computed >= limit { break 'grp } + } + } + } + + documents.truncate(limit); + documents.into_iter() +}