feat: Introduce a WordArea struct

Useful to highlight matching areas in the original text.
This commit is contained in:
Clément Renault 2018-12-23 16:46:49 +01:00
parent 62521262e8
commit b32c96cdc9
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
14 changed files with 373 additions and 136 deletions

View File

@ -35,5 +35,7 @@ nightly = []
[dev-dependencies] [dev-dependencies]
csv = "1.0" csv = "1.0"
elapsed = "0.1" elapsed = "0.1"
quickcheck = "0.7"
structopt = "0.2" structopt = "0.2"
tempfile = "3.0" tempfile = "3.0"
termcolor = "1.0"

View File

@ -2,10 +2,12 @@ use std::io::{self, Write};
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use serde_derive::{Serialize, Deserialize}; use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb::database::Database; use meilidb::database::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct Opt { pub struct Opt {
@ -26,6 +28,40 @@ struct Document {
image: String, image: String,
} }
/// Writes `text` to stdout, coloring in yellow every other region
/// delimited by consecutive offsets of `ranges`.
///
/// Regions at odd window positions are the matching areas; the
/// surrounding even ones are printed with the default color.
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
    let mut stream = StandardStream::stdout(ColorChoice::Always);
    for (i, window) in ranges.windows(2).enumerate() {
        let (start, end) = (window[0], window[1]);
        // every second region is a match and gets highlighted
        if i % 2 == 1 {
            stream.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
        }
        write!(&mut stream, "{}", &text[start..end])?;
        stream.reset()?;
    }
    Ok(())
}
/// Builds the list of byte offsets delimiting the plain/highlighted
/// regions of `text` from the matches found in the given attribute.
///
/// The returned offsets always begin with 0 and end with `text.len()`;
/// each retained match contributes its start and end byte positions.
fn create_highlight_areas(text: &str, matches: &[Match], attribute: u16) -> Vec<usize> {
    let mut areas = vec![0];
    for match_ in matches.iter().filter(|m| m.attribute.attribute() == attribute) {
        let start = match_.word_area.byte_index() as usize;
        let end = start + match_.word_area.length() as usize;
        areas.push(start);
        areas.push(end);
    }
    areas.push(text.len());
    areas
}
fn main() -> Result<(), Box<Error>> { fn main() -> Result<(), Box<Error>> {
let opt = Opt::from_args(); let opt = Opt::from_args();
@ -41,26 +77,35 @@ fn main() -> Result<(), Box<Error>> {
io::stdout().flush()?; io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break } if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n');
let view = database.view(); let view = database.view();
let (elapsed, documents) = elapsed::measure_time(|| { let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap(); let builder = view.query_builder().unwrap();
builder.query(&buffer, 0..opt.number_results) builder.query(query, 0..opt.number_results)
}); });
let mut full_documents = Vec::with_capacity(documents.len()); let number_of_documents = documents.len();
for doc in documents {
match view.retrieve_document::<Document>(doc.id) {
Ok(document) => {
for document in documents { print!("title: ");
match view.retrieve_document::<Document>(document.id) { let title_areas = create_highlight_areas(&document.title, &doc.matches, 1);
Ok(document) => full_documents.push(document), display_highlights(&document.title, &title_areas)?;
println!();
print!("description: ");
let description_areas = create_highlight_areas(&document.description, &doc.matches, 2);
display_highlights(&document.description, &description_areas)?;
println!();
},
Err(e) => eprintln!("{}", e), Err(e) => eprintln!("{}", e),
} }
} }
println!("{:#?}", full_documents); println!("Found {} results in {}", number_of_documents, elapsed);
println!("Found {} results in {}", full_documents.len(), elapsed);
buffer.clear(); buffer.clear();
} }

View File

@ -158,14 +158,15 @@ mod tests {
use super::*; use super::*;
use std::error::Error; use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId; use crate::DocumentId;
#[test] #[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> { fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let mut builder = DocIndexesBuilder::memory(); let mut builder = DocIndexesBuilder::memory();
@ -186,9 +187,9 @@ mod tests {
#[test] #[test]
fn serialize_deserialize() -> Result<(), Box<Error>> { fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let mut builder = DocIndexesBuilder::memory(); let mut builder = DocIndexesBuilder::memory();

View File

@ -203,14 +203,15 @@ mod tests {
use super::*; use super::*;
use std::error::Error; use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId; use crate::DocumentId;
#[test] #[test]
fn serialize_deserialize() -> Result<(), Box<Error>> { fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let mut builder = PositiveBlobBuilder::memory(); let mut builder = PositiveBlobBuilder::memory();
@ -231,9 +232,9 @@ mod tests {
#[test] #[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> { fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let mut builder = PositiveBlobBuilder::memory(); let mut builder = PositiveBlobBuilder::memory();

View File

@ -73,7 +73,7 @@ impl DocumentKeyAttr {
let mut wtr = Cursor::new(&mut buffer[..]); let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap(); wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap(); wtr.write_all(b"-").unwrap();
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap(); wtr.write_u16::<NativeEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer) DocumentKeyAttr(buffer)
} }
@ -95,7 +95,7 @@ impl DocumentKeyAttr {
pub fn attribute(&self) -> SchemaAttr { pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1; let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap(); let value = (&self.0[offset..]).read_u16::<NativeEndian>().unwrap();
SchemaAttr::new(value) SchemaAttr::new(value)
} }
@ -114,7 +114,7 @@ impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr") f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id()) .field("document_id", &self.document_id())
.field("attribute", &self.attribute().as_u32()) .field("attribute", &self.attribute().0)
.finish() .finish()
} }
} }

View File

@ -1,6 +1,6 @@
use std::collections::{HashMap, BTreeMap}; use std::collections::{HashMap, BTreeMap};
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::{fmt, u32}; use std::{fmt, u16};
use std::path::Path; use std::path::Path;
use std::ops::BitOr; use std::ops::BitOr;
use std::sync::Arc; use std::sync::Arc;
@ -53,7 +53,7 @@ impl SchemaBuilder {
if self.attrs.insert(name.into(), props).is_some() { if self.attrs.insert(name.into(), props).is_some() {
panic!("Field already inserted.") panic!("Field already inserted.")
} }
SchemaAttr(len as u32) SchemaAttr(len as u16)
} }
pub fn build(self) -> Schema { pub fn build(self) -> Schema {
@ -61,7 +61,7 @@ impl SchemaBuilder {
let mut props = Vec::new(); let mut props = Vec::new();
for (i, (name, prop)) in self.attrs.into_iter().enumerate() { for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u32)); attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop)); props.push((name, prop));
} }
@ -94,10 +94,9 @@ impl Schema {
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> { pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
let mut ordered = BTreeMap::new(); let mut ordered = BTreeMap::new();
for (name, field) in &self.inner.attrs { for (name, attr) in &self.inner.attrs {
let index = field.as_u32(); let (_, props) = self.inner.props[attr.0 as usize];
let (_, props) = self.inner.props[index as usize]; ordered.insert(attr.0, (name, props));
ordered.insert(index, (name, props));
} }
let mut attrs = LinkedHashMap::with_capacity(ordered.len()); let mut attrs = LinkedHashMap::with_capacity(ordered.len());
@ -109,8 +108,7 @@ impl Schema {
} }
pub fn props(&self, attr: SchemaAttr) -> SchemaProps { pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let index = attr.as_u32(); let (_, props) = self.inner.props[attr.0 as usize];
let (_, props) = self.inner.props[index as usize];
props props
} }
@ -119,26 +117,21 @@ impl Schema {
} }
pub fn attribute_name(&self, attr: SchemaAttr) -> &str { pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let index = attr.as_u32(); let (name, _) = &self.inner.props[attr.0 as usize];
let (name, _) = &self.inner.props[index as usize];
name name
} }
} }
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(u32); pub struct SchemaAttr(pub(crate) u16);
impl SchemaAttr { impl SchemaAttr {
pub fn new(value: u32) -> SchemaAttr { pub fn new(value: u16) -> SchemaAttr {
SchemaAttr(value) SchemaAttr(value)
} }
pub fn max() -> SchemaAttr { pub fn max() -> SchemaAttr {
SchemaAttr(u32::MAX) SchemaAttr(u16::MAX)
}
pub fn as_u32(&self) -> u32 {
self.0
} }
} }

View File

@ -9,12 +9,12 @@ use serde::ser::{self, Serialize};
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder; use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob; use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr}; use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::{TokenizerBuilder, Token};
use crate::database::DocumentKeyAttr; use crate::database::DocumentKeyAttr;
use crate::database::update::Update; use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX; use crate::database::DATA_INDEX;
use crate::database::blob::Blob; use crate::database::blob::Blob;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
pub enum NewState { pub enum NewState {
Updated { value: Vec<u8> }, Updated { value: Vec<u8> },
@ -355,11 +355,11 @@ where B: TokenizerBuilder
} }
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> { fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for (index, word) in self.tokenizer_builder.build(v) { for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
let doc_index = DocIndex { let doc_index = DocIndex {
document_id: self.document_id, document_id: self.document_id,
attribute: self.attribute.as_u32() as u8, attribute: Attribute::new(self.attribute.0, word_index as u32),
attribute_index: index as u32, word_area: WordArea::new(char_index as u32, word.len() as u16),
}; };
// insert the exact representation // insert the exact representation

View File

@ -6,6 +6,8 @@ pub mod tokenizer;
pub mod vec_read_only; pub mod vec_read_only;
mod common_words; mod common_words;
use std::fmt;
pub use rocksdb; pub use rocksdb;
pub use self::tokenizer::Tokenizer; pub use self::tokenizer::Tokenizer;
@ -18,28 +20,110 @@ pub use self::common_words::CommonWords;
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(pub u64); pub struct DocumentId(pub u64);
/// Represent an attribute number along with the word index
/// according to the tokenizer used.
///
/// It can accept up to 1024 attributes and word positions
/// can be maximum 2^22.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);

impl Attribute {
    /// Construct an `Attribute` from an attribute number and
    /// the word position of a match according to the tokenizer used.
    ///
    /// # Panics
    ///
    /// The attribute must be strictly lower than 1024
    /// and the word index strictly lower than 2^22.
    fn new(attribute: u16, index: u32) -> Attribute {
        // layout: 10 bits of attribute (high) | 22 bits of word index (low)
        assert!(attribute < 1 << 10);
        // NOTE: the previous 28-bit mask (0b1111_1111_1100_...) only covered
        // bits 18..28, rejecting valid 22-bit indexes and letting values with
        // bits 28..32 set corrupt the attribute part; compare directly instead.
        assert!(index < 1 << 22);

        let attribute = (attribute as u32) << 22;
        Attribute(attribute | index)
    }

    /// The attribute number (the high 10 bits).
    pub fn attribute(&self) -> u16 {
        (self.0 >> 22) as u16
    }

    /// The word index (the low 22 bits).
    pub fn word_index(&self) -> u32 {
        // keep all 22 low bits: the old mask had only 18 ones,
        // truncating perfectly valid word indexes
        self.0 & ((1 << 22) - 1)
    }
}

impl fmt::Debug for Attribute {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Attribute")
            .field("attribute", &self.attribute())
            .field("word_index", &self.word_index())
            .finish()
    }
}
/// Represent a word position in bytes along with the length of it.
///
/// It can represent words byte index to maximum 2^22 and
/// up to words of length 1024.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);

impl WordArea {
    /// Construct a `WordArea` from a word position in bytes
    /// and the length of it.
    ///
    /// # Panics
    ///
    /// The byte index must be strictly lower than 2^22
    /// and the length strictly lower than 1024.
    fn new(byte_index: u32, length: u16) -> WordArea {
        // layout: 22 bits of byte index (high) | 10 bits of length (low)
        // NOTE: the previous 28-bit mask (0b1111_1111_1100_...) only covered
        // bits 18..28, panicking on valid 22-bit byte indexes while letting
        // values with bits 28..32 set silently overflow the shift below.
        assert!(byte_index < 1 << 22);
        assert!(length < 1 << 10);

        WordArea((byte_index << 10) | length as u32)
    }

    /// The byte position of the word in the original text (high 22 bits).
    pub fn byte_index(&self) -> u32 {
        self.0 >> 10
    }

    /// The byte length of the word in the original text (low 10 bits).
    pub fn length(&self) -> u16 {
        (self.0 & ((1 << 10) - 1)) as u16
    }
}

impl fmt::Debug for WordArea {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("WordArea")
            .field("byte_index", &self.byte_index())
            .field("length", &self.length())
            .finish()
    }
}
/// This structure represent the position of a word /// This structure represent the position of a word
/// in a document and its attributes. /// in a document and its attributes.
/// ///
/// This is stored in the map, generated at index time, /// This is stored in the map, generated at index time,
/// extracted and interpreted at search time. /// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)] #[repr(C)]
pub struct DocIndex { pub struct DocIndex {
/// The document identifier where the word was found. /// The document identifier where the word was found.
pub document_id: DocumentId, pub document_id: DocumentId,
/// The attribute identifier in the document /// The attribute in the document where the word was found
/// where the word was found. /// along with the index in it.
/// pub attribute: Attribute,
/// This is an `u8` therefore a document
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// The index where the word was found in the attribute. /// The position in bytes where the word was found
/// along with the length of it.
/// ///
/// Only the first 1000 words are indexed. /// It informs on the original word area in the text indexed
pub attribute_index: u32, /// without needing to run the tokenizer again.
pub word_area: WordArea,
} }
/// This structure represent a matching word with informations /// This structure represent a matching word with informations
@ -50,7 +134,7 @@ pub struct DocIndex {
/// ///
/// The word in itself is not important. /// The word in itself is not important.
// TODO do data oriented programming ? very arrays ? // TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match { pub struct Match {
/// The word index in the query sentence. /// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words. /// Same as the `attribute_index` but for the query words.
@ -62,23 +146,19 @@ pub struct Match {
/// (i.e. the Levenshtein distance). /// (i.e. the Levenshtein distance).
pub distance: u8, pub distance: u8,
/// The attribute in which the word is located /// The attribute in the document where the word was found
/// (i.e. Title is 0, Description is 1). /// along with the index in it.
/// pub attribute: Attribute,
/// This is an `u8` therefore a document
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// Where does this word is located in the attribute string
/// (i.e. at the start or the end of the attribute).
///
/// The index in the attribute is limited to a maximum of `2^32`
/// this is because we index only the first 1000 words
/// in an attribute.
pub attribute_index: u32,
/// Whether the word that match is an exact match or a prefix. /// Whether the word that match is an exact match or a prefix.
pub is_exact: bool, pub is_exact: bool,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub word_area: WordArea,
} }
impl Match { impl Match {
@ -86,9 +166,9 @@ impl Match {
Match { Match {
query_index: 0, query_index: 0,
distance: 0, distance: 0,
attribute: 0, attribute: Attribute::new(0, 0),
attribute_index: 0,
is_exact: false, is_exact: false,
word_area: WordArea::new(0, 0),
} }
} }
@ -96,9 +176,71 @@ impl Match {
Match { Match {
query_index: u32::max_value(), query_index: u32::max_value(),
distance: u8::max_value(), distance: u8::max_value(),
attribute: u8::max_value(), attribute: Attribute(u32::max_value()),
attribute_index: u32::max_value(),
is_exact: true, is_exact: true,
word_area: WordArea(u32::max_value()),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
quickcheck! {
fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) {
return TestResult::discard()
}
let attribute = Attribute::new(gen_attr, gen_index);
let valid_attribute = attribute.attribute() == gen_attr;
let valid_index = attribute.word_index() == gen_index;
TestResult::from_bool(valid_attribute && valid_index)
}
fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
return TestResult::discard()
}
let a = Attribute::new(gen_attr, gen_index);
let b = Attribute::new(gen_attr + 1, gen_index + 1);
TestResult::from_bool(a < b)
}
fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult {
if gen_byte_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
return TestResult::discard()
}
let word_area = WordArea::new(gen_byte_index, gen_length);
let valid_char_index = word_area.byte_index() == gen_byte_index;
let valid_length = word_area.length() == gen_length;
TestResult::from_bool(valid_char_index && valid_length)
}
fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult {
if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
return TestResult::discard()
}
let a = WordArea::new(gen_byte_index, gen_length);
let b = WordArea::new(gen_byte_index + 1, gen_length + 1);
TestResult::from_bool(a < b)
} }
} }
} }

View File

@ -44,7 +44,7 @@ where D: Deref<Target=DB>
mod tests { mod tests {
use super::*; use super::*;
use crate::DocumentId; use crate::{DocumentId, Attribute, WordArea};
// typing: "Geox CEO" // typing: "Geox CEO"
// //
@ -54,8 +54,8 @@ mod tests {
fn one_typo_reference() { fn one_typo_reference() {
let doc0 = { let doc0 = {
let matches = vec![ let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false }, Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) },
]; ];
Document { Document {
id: DocumentId(0), id: DocumentId(0),
@ -65,8 +65,8 @@ mod tests {
let doc1 = { let doc1 = {
let matches = vec![ let matches = vec![
Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false }, Match { query_index: 0, distance: 1, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false }, Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) },
]; ];
Document { Document {
id: DocumentId(1), id: DocumentId(1),
@ -87,8 +87,8 @@ mod tests {
fn no_typo() { fn no_typo() {
let doc0 = { let doc0 = {
let matches = vec![ let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false }, Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) },
]; ];
Document { Document {
id: DocumentId(0), id: DocumentId(0),
@ -98,7 +98,7 @@ mod tests {
let doc1 = { let doc1 = {
let matches = vec![ let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
]; ];
Document { Document {
id: DocumentId(1), id: DocumentId(1),
@ -119,8 +119,8 @@ mod tests {
fn one_typo() { fn one_typo() {
let doc0 = { let doc0 = {
let matches = vec![ let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false }, Match { query_index: 1, distance: 1, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) },
]; ];
Document { Document {
id: DocumentId(0), id: DocumentId(0),
@ -130,7 +130,7 @@ mod tests {
let doc1 = { let doc1 = {
let matches = vec![ let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
]; ];
Document { Document {
id: DocumentId(1), id: DocumentId(1),

View File

@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
use crate::Match; use crate::Match;
#[inline] #[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 { fn sum_matches_attributes(matches: &[Match]) -> u16 {
// note that GroupBy will never return an empty group // note that GroupBy will never return an empty group
// so we can do this assumption safely // so we can do this assumption safely
GroupBy::new(matches, match_query_index).map(|group| unsafe { GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute group.get_unchecked(0).attribute.attribute()
}).sum() }).sum()
} }

View File

@ -14,7 +14,7 @@ fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
// note that GroupBy will never return an empty group // note that GroupBy will never return an empty group
// so we can do this assumption safely // so we can do this assumption safely
GroupBy::new(matches, match_query_index).map(|group| unsafe { GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute_index group.get_unchecked(0).attribute.word_index()
}).sum() }).sum()
} }

View File

@ -20,8 +20,8 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
} }
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
if lhs.attribute != rhs.attribute { return MAX_DISTANCE } if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
index_proximity(lhs.attribute_index, rhs.attribute_index) index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
} }
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
@ -67,6 +67,8 @@ where D: Deref<Target=DB>
mod tests { mod tests {
use super::*; use super::*;
use crate::Attribute;
#[test] #[test]
fn three_different_attributes() { fn three_different_attributes() {
@ -79,11 +81,11 @@ mod tests {
// { id: 3, attr: 3, attr_index: 1 } // { id: 3, attr: 3, attr_index: 1 }
let matches = &[ let matches = &[
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() }, Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() },
Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() }, Match { query_index: 1, attribute: Attribute::new(1, 0), ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() }, Match { query_index: 2, attribute: Attribute::new(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() }, Match { query_index: 2, attribute: Attribute::new(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() }, Match { query_index: 3, attribute: Attribute::new(3, 1), ..Match::zero() },
]; ];
// soup -> of = 8 // soup -> of = 8
@ -105,12 +107,12 @@ mod tests {
// { id: 3, attr: 1, attr_index: 3 } // { id: 3, attr: 1, attr_index: 3 }
let matches = &[ let matches = &[
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() }, Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() },
Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() }, Match { query_index: 0, attribute: Attribute::new(1, 0), ..Match::zero() },
Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() }, Match { query_index: 1, attribute: Attribute::new(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() }, Match { query_index: 2, attribute: Attribute::new(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() }, Match { query_index: 3, attribute: Attribute::new(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() }, Match { query_index: 3, attribute: Attribute::new(1, 3), ..Match::zero() },
]; ];
// soup -> of = 1 // soup -> of = 1

View File

@ -97,8 +97,8 @@ where D: Deref<Target=DB>
query_index: iv.index as u32, query_index: iv.index as u32,
distance: distance, distance: distance,
attribute: doc_index.attribute, attribute: doc_index.attribute,
attribute_index: doc_index.attribute_index,
is_exact: is_exact, is_exact: is_exact,
word_area: doc_index.word_area,
}; };
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
} }

View File

@ -2,7 +2,7 @@ use std::mem;
use self::Separator::*; use self::Separator::*;
pub trait TokenizerBuilder { pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>; fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
} }
pub struct DefaultBuilder; pub struct DefaultBuilder;
@ -13,22 +13,39 @@ impl DefaultBuilder {
} }
} }
/// A word yielded by the tokenizer along with its position in the text.
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
    /// The word itself, borrowed from the tokenized text.
    pub word: &'a str,
    /// The index of this word among the tokenized words.
    // NOTE(review): advanced by Separator::to_usize between words,
    // so separators weight the distance — confirm intended semantics.
    pub word_index: usize,
    /// The position where the word starts in the text.
    // NOTE(review): despite the name, the Tokenizer advances this with
    // byte lengths (`word.len()`, `spaces.len()`) — confirm it is a byte offset.
    pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder { impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> { fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text)) Box::new(Tokenizer::new(text))
} }
} }
pub struct Tokenizer<'a> { pub struct Tokenizer<'a> {
index: usize, word_index: usize,
char_index: usize,
inner: &'a str, inner: &'a str,
} }
impl<'a> Tokenizer<'a> { impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer { pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer { Tokenizer {
index: 0, word_index: 0,
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]), char_index: char_advance,
inner: &string[index_advance..],
} }
} }
} }
@ -56,43 +73,58 @@ impl Separator {
} }
} }
/// Classifies a character as a long separator, a short separator,
/// or not a separator at all.
fn detect_separator(c: char) -> Option<Separator> {
    if ".;,!?-".contains(c) {
        Some(Long)
    } else if " '\"".contains(c) {
        Some(Short)
    } else {
        None
    }
}
impl<'a> Iterator for Tokenizer<'a> { impl<'a> Iterator for Tokenizer<'a> {
type Item = (usize, &'a str); type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None; let mut start_word = None;
let mut distance = None; let mut distance = None;
for (i, c) in self.inner.char_indices() { for (i, c) in self.inner.char_indices() {
let separator = match c { match detect_separator(c) {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long), Some(sep) => {
' ' | '\'' | '"' => Some(Short),
_ => None,
};
match separator {
Some(dist) => {
if let Some(start_word) = start_word { if let Some(start_word) = start_word {
let (word, tail) = self.inner.split_at(i); let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail; self.inner = tail;
self.index += distance.map(Separator::to_usize).unwrap_or(0); self.char_index += spaces.len();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let word = &word[start_word..]; let token = Token {
return Some((self.index, word)) word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.len();
return Some(token)
} }
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
distance.replace(distance.map_or(sep, |s| s.add(sep)));
}, },
None => { start_word.get_or_insert(i); }, None => { start_word.get_or_insert(i); },
} }
} }
if let Some(start_word) = start_word { if let Some(start_word) = start_word {
let word = mem::replace(&mut self.inner, ""); let prefix = mem::replace(&mut self.inner, "");
self.index += distance.map(Separator::to_usize).unwrap_or(0); let (spaces, word) = prefix.split_at(start_word);
let word = &word[start_word..]; let token = Token {
return Some((self.index, word)) word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.len(),
};
return Some(token)
} }
None None
@ -107,12 +139,12 @@ mod tests {
fn easy() { fn easy() {
let mut tokenizer = Tokenizer::new("salut"); let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some((0, "salut"))); assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo "); let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some((0, "yo"))); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
@ -120,18 +152,37 @@ mod tests {
fn hard() { fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe"); let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
assert_eq!(tokenizer.next(), Some((0, "yo"))); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some((1, "lolo"))); assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some((9, "aïe"))); assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some((0, "yo"))); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some((8, "lolo"))); assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some((16, "wtf"))); assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some((24, "lol"))); assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some((32, "aïe"))); assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 19 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 25 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
} }