2019-10-08 21:22:36 +08:00
|
|
|
#[cfg(test)]
|
2019-10-18 19:05:28 +08:00
|
|
|
#[macro_use]
|
|
|
|
extern crate assert_matches;
|
2020-04-07 02:05:02 +08:00
|
|
|
#[macro_use]
|
|
|
|
extern crate pest_derive;
|
2019-10-08 21:22:36 +08:00
|
|
|
|
2019-10-02 23:34:32 +08:00
|
|
|
mod automaton;
|
2019-12-13 18:49:56 +08:00
|
|
|
mod bucket_sort;
|
2019-10-04 22:49:17 +08:00
|
|
|
mod database;
|
2019-10-09 19:44:18 +08:00
|
|
|
mod distinct_map;
|
2019-10-03 21:04:11 +08:00
|
|
|
mod error;
|
2020-04-07 02:05:02 +08:00
|
|
|
mod filters;
|
2019-10-31 00:25:42 +08:00
|
|
|
mod levenshtein;
|
2019-10-03 17:49:13 +08:00
|
|
|
mod number;
|
2019-10-02 23:34:32 +08:00
|
|
|
mod query_builder;
|
2020-01-08 00:40:58 +08:00
|
|
|
mod query_tree;
|
2020-01-13 20:29:47 +08:00
|
|
|
mod query_words_mapper;
|
2019-10-03 21:04:11 +08:00
|
|
|
mod ranked_map;
|
2019-10-02 23:34:32 +08:00
|
|
|
mod raw_document;
|
2019-10-18 19:05:28 +08:00
|
|
|
mod reordered_attrs;
|
2019-12-13 18:49:56 +08:00
|
|
|
pub mod criterion;
|
2020-05-06 04:19:34 +08:00
|
|
|
pub mod facets;
|
2019-12-13 18:49:56 +08:00
|
|
|
pub mod raw_indexer;
|
2019-10-03 17:49:13 +08:00
|
|
|
pub mod serde;
|
2020-05-19 20:11:48 +08:00
|
|
|
pub mod settings;
|
2019-10-02 23:34:32 +08:00
|
|
|
pub mod store;
|
2020-05-19 20:11:48 +08:00
|
|
|
pub mod update;
|
2019-11-30 23:53:34 +08:00
|
|
|
|
2020-04-29 06:40:06 +08:00
|
|
|
pub use self::database::{BoxUpdateFn, Database, DatabaseOptions, MainT, UpdateT};
|
2020-05-06 04:19:34 +08:00
|
|
|
pub use self::error::{Error, HeedError, FstError, MResult, pest_error, FacetError};
|
2020-04-07 02:05:02 +08:00
|
|
|
pub use self::filters::Filter;
|
2019-10-04 19:26:33 +08:00
|
|
|
pub use self::number::{Number, ParseNumberError};
|
|
|
|
pub use self::ranked_map::RankedMap;
|
2019-10-04 22:49:17 +08:00
|
|
|
pub use self::raw_document::RawDocument;
|
2019-10-04 19:26:33 +08:00
|
|
|
pub use self::store::Index;
|
2019-10-31 18:13:37 +08:00
|
|
|
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
|
2019-12-13 18:46:53 +08:00
|
|
|
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
|
2020-02-03 05:59:19 +08:00
|
|
|
pub use meilisearch_schema::Schema;
|
2020-01-13 20:29:47 +08:00
|
|
|
pub use query_words_mapper::QueryWordsMapper;
|
2019-10-02 23:34:32 +08:00
|
|
|
|
2019-12-12 00:36:53 +08:00
|
|
|
use compact_arena::SmallArena;
|
2020-02-03 05:59:19 +08:00
|
|
|
use log::{error, trace};
|
2020-05-22 21:00:50 +08:00
|
|
|
use std::borrow::Cow;
|
|
|
|
use std::collections::HashMap;
|
|
|
|
use std::convert::TryFrom;
|
2020-01-16 21:56:16 +08:00
|
|
|
|
2020-01-16 21:24:45 +08:00
|
|
|
use crate::bucket_sort::PostingsListView;
|
2019-12-12 00:36:53 +08:00
|
|
|
use crate::levenshtein::prefix_damerau_levenshtein;
|
2020-01-16 21:56:16 +08:00
|
|
|
use crate::query_tree::{QueryId, QueryKind};
|
2019-12-13 20:22:54 +08:00
|
|
|
use crate::reordered_attrs::ReorderedAttrs;
|
2019-12-12 00:36:53 +08:00
|
|
|
|
2020-05-22 21:00:50 +08:00
|
|
|
/// An `fst::Set` whose underlying bytes are held in a `Cow`, i.e. either borrowed for `'a` or owned.
type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>;
|
|
|
|
/// An `fst::Map` whose underlying bytes are held in a `Cow`, i.e. either borrowed for `'a` or owned.
type FstMapCow<'a> = fst::Map<Cow<'a, [u8]>>;
|
|
|
|
|
2019-12-13 18:14:12 +08:00
|
|
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
2019-10-02 23:34:32 +08:00
|
|
|
pub struct Document {
|
|
|
|
pub id: DocumentId,
|
|
|
|
pub highlights: Vec<Highlight>,
|
|
|
|
|
2019-12-13 18:14:12 +08:00
|
|
|
#[cfg(test)]
|
|
|
|
pub matches: Vec<crate::bucket_sort::SimpleMatch>,
|
2019-10-02 23:34:32 +08:00
|
|
|
}
|
|
|
|
|
2019-12-13 20:22:54 +08:00
|
|
|
fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
|
|
|
raw_document: &RawDocument<'a, 'tag>,
|
2020-01-16 21:56:16 +08:00
|
|
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
2019-12-13 20:22:54 +08:00
|
|
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2020-01-14 02:34:49 +08:00
|
|
|
schema: &Schema,
|
2019-12-13 20:22:54 +08:00
|
|
|
) -> Vec<Highlight>
|
|
|
|
{
|
|
|
|
let mut highlights = Vec::new();
|
|
|
|
|
|
|
|
for bm in raw_document.bare_matches.iter() {
|
|
|
|
let postings_list = &arena[bm.postings_list];
|
|
|
|
let input = postings_list.input();
|
2020-01-16 21:56:16 +08:00
|
|
|
let kind = &queries_kinds.get(&bm.query_index);
|
2019-12-13 20:22:54 +08:00
|
|
|
|
|
|
|
for di in postings_list.iter() {
|
2020-01-16 21:56:16 +08:00
|
|
|
let covered_area = match kind {
|
2020-01-23 01:11:58 +08:00
|
|
|
Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
|
2020-01-16 21:56:16 +08:00
|
|
|
let len = if query.len() > input.len() {
|
|
|
|
input.len()
|
|
|
|
} else {
|
|
|
|
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
|
|
|
};
|
|
|
|
u16::try_from(len).unwrap_or(u16::max_value())
|
|
|
|
},
|
|
|
|
_ => di.char_length,
|
|
|
|
};
|
2019-12-13 20:22:54 +08:00
|
|
|
|
|
|
|
let attribute = searchable_attrs
|
|
|
|
.and_then(|sa| sa.reverse(di.attribute))
|
|
|
|
.unwrap_or(di.attribute);
|
|
|
|
|
2020-01-14 02:34:49 +08:00
|
|
|
let attribute = match schema.indexed_pos_to_field_id(attribute) {
|
|
|
|
Some(field_id) => field_id.0,
|
|
|
|
None => {
|
|
|
|
error!("Cannot convert indexed_pos {} to field_id", attribute);
|
2020-01-30 01:30:21 +08:00
|
|
|
trace!("Schema is compromized; {:?}", schema);
|
2020-01-14 02:34:49 +08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-12-13 20:22:54 +08:00
|
|
|
let highlight = Highlight {
|
2020-02-03 05:59:19 +08:00
|
|
|
attribute,
|
2019-12-13 20:22:54 +08:00
|
|
|
char_index: di.char_index,
|
2020-01-16 21:56:16 +08:00
|
|
|
char_length: covered_area,
|
2019-12-13 20:22:54 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
highlights.push(highlight);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
highlights
|
|
|
|
}
|
|
|
|
|
2019-12-12 00:36:53 +08:00
|
|
|
impl Document {
|
2019-12-21 20:44:19 +08:00
|
|
|
#[cfg(not(test))]
|
|
|
|
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
|
|
|
Document { id, highlights: highlights.to_owned() }
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
|
|
|
Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
|
|
|
|
}
|
|
|
|
|
2019-12-13 20:22:54 +08:00
|
|
|
#[cfg(not(test))]
|
2019-12-12 00:36:53 +08:00
|
|
|
pub fn from_raw<'a, 'tag, 'txn>(
|
|
|
|
raw_document: RawDocument<'a, 'tag>,
|
2020-01-16 21:56:16 +08:00
|
|
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
2019-12-12 00:36:53 +08:00
|
|
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
2019-12-13 20:22:54 +08:00
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2020-01-14 02:34:49 +08:00
|
|
|
schema: &Schema,
|
2019-12-12 00:36:53 +08:00
|
|
|
) -> Document
|
|
|
|
{
|
2020-01-16 21:24:45 +08:00
|
|
|
let highlights = highlights_from_raw_document(
|
|
|
|
&raw_document,
|
2020-01-16 21:56:16 +08:00
|
|
|
queries_kinds,
|
2020-01-16 21:24:45 +08:00
|
|
|
arena,
|
|
|
|
searchable_attrs,
|
2020-01-14 02:34:49 +08:00
|
|
|
schema,
|
2020-01-16 21:24:45 +08:00
|
|
|
);
|
2019-12-13 20:22:54 +08:00
|
|
|
|
|
|
|
Document { id: raw_document.id, highlights }
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
pub fn from_raw<'a, 'tag, 'txn>(
|
|
|
|
raw_document: RawDocument<'a, 'tag>,
|
2020-01-16 21:56:16 +08:00
|
|
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
2019-12-13 20:22:54 +08:00
|
|
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
|
|
|
searchable_attrs: Option<&ReorderedAttrs>,
|
2020-01-14 02:34:49 +08:00
|
|
|
schema: &Schema,
|
2019-12-13 20:22:54 +08:00
|
|
|
) -> Document
|
|
|
|
{
|
|
|
|
use crate::bucket_sort::SimpleMatch;
|
2019-12-13 18:14:12 +08:00
|
|
|
|
2020-01-16 21:24:45 +08:00
|
|
|
let highlights = highlights_from_raw_document(
|
|
|
|
&raw_document,
|
2020-01-16 21:56:16 +08:00
|
|
|
queries_kinds,
|
2020-01-16 21:24:45 +08:00
|
|
|
arena,
|
|
|
|
searchable_attrs,
|
2020-01-14 02:34:49 +08:00
|
|
|
schema,
|
2020-01-16 21:24:45 +08:00
|
|
|
);
|
2019-12-13 20:22:54 +08:00
|
|
|
|
|
|
|
let mut matches = Vec::new();
|
|
|
|
for sm in raw_document.processed_matches {
|
|
|
|
let attribute = searchable_attrs
|
|
|
|
.and_then(|sa| sa.reverse(sm.attribute))
|
|
|
|
.unwrap_or(sm.attribute);
|
|
|
|
|
2020-01-14 02:34:49 +08:00
|
|
|
let attribute = match schema.indexed_pos_to_field_id(attribute) {
|
|
|
|
Some(field_id) => field_id.0,
|
|
|
|
None => {
|
|
|
|
error!("Cannot convert indexed_pos {} to field_id", attribute);
|
2020-01-30 01:30:21 +08:00
|
|
|
trace!("Schema is compromized; {:?}", schema);
|
2020-01-14 02:34:49 +08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-12-13 20:22:54 +08:00
|
|
|
matches.push(SimpleMatch { attribute, ..sm });
|
2019-12-13 18:14:12 +08:00
|
|
|
}
|
2019-12-13 20:22:54 +08:00
|
|
|
matches.sort_unstable();
|
|
|
|
|
|
|
|
Document { id: raw_document.id, highlights, matches }
|
2019-12-12 00:36:53 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-10-02 23:34:32 +08:00
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// Guards against accidental growth of `DocIndex`: it must stay 12 bytes wide.
    #[test]
    fn docindex_mem_size() {
        assert_eq!(size_of::<DocIndex>(), 12);
    }
}
|