From e616b1e356417ef4daf4f263b870bbfd60aad1ce Mon Sep 17 00:00:00 2001
From: mpostma
Date: Thu, 26 Nov 2020 10:18:36 +0100
Subject: [PATCH] hard separator offset

---
 meilisearch-core/src/database.rs    |  8 +++---
 meilisearch-core/src/query_tree.rs  | 25 ++++++++++++++-----
 meilisearch-core/src/raw_indexer.rs | 38 +++++++++++++++++------------
 3 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/meilisearch-core/src/database.rs b/meilisearch-core/src/database.rs
index 94563a5a9..da8d44d6a 100644
--- a/meilisearch-core/src/database.rs
+++ b/meilisearch-core/src/database.rs
@@ -193,8 +193,8 @@ fn version_guard(path: &Path, create: bool) -> MResult<(u32, u32, u32)> {
             Err(Error::VersionMismatch(format!("{}.{}.XX", version_major, version_minor)))
         } else {
             Ok((
-                version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
-                version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
+                version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
+                version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
                 version_patch.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?
             ))
         }
@@ -212,8 +212,8 @@ fn version_guard(path: &Path, create: bool) -> MResult<(u32, u32, u32)> {
                 current_version_patch).as_bytes())?;
 
             Ok((
-                current_version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
-                current_version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
+                current_version_major.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
+                current_version_minor.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?,
                 current_version_patch.parse().map_err(|e| Error::VersionMismatch(format!("error parsing database version: {}", e)))?
             ))
         } else {
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index cb3921567..f16f431fa 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::hash::{Hash, Hasher};
 use std::ops::Range;
 use std::time::Instant;
@@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once};
 use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
 use log::debug;
-use meilisearch_tokenizer::Token;
+use meilisearch_tokenizer::{Token, token::SeparatorKind};
 use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
 use sdset::{Set, SetBuf, SetOperation};
 
@@ -175,13 +175,20 @@ where I: IntoIterator,
 
 const MAX_NGRAM: usize = 3;
 
-fn split_query_string(s: &str) -> Vec<(usize, String)> {
+fn split_query_string(s: &str, stop_words: HashSet<String>) -> Vec<(usize, String)> {
     // TODO: Use global instance instead
-    let analyzer = Analyzer::new(AnalyzerConfig::default());
+    let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
     analyzer
         .analyze(s)
         .tokens()
-        .filter(|t| !t.is_stopword())
+        .scan(0, |offset, mut token| {
+            token.char_index += *offset;
+            if let Some(SeparatorKind::Hard) = token.is_separator() {
+                *offset += 8;
+            }
+            Some(token)
+        })
+        .filter(|t| t.is_word())
         .enumerate()
         .map(|(i, Token { word, .. })| (i, word.to_string()))
         .collect()
@@ -193,7 +200,13 @@ pub fn create_query_tree(
     query: &str,
 ) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)> {
-    let words = split_query_string(query);
+    // TODO: use a shared analyzer instance
+    let words = split_query_string(query, ctx.stop_words
+        .stream()
+        .into_strs()
+        .unwrap_or_default()
+        .into_iter()
+        .collect());
 
     let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
 
diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs
index dd7743e53..510717f4d 100644
--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@@ -1,11 +1,10 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
-use std::println;
 
 use meilisearch_schema::IndexedPos;
 use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
-use meilisearch_tokenizer::Token;
+use meilisearch_tokenizer::{Token, token::SeparatorKind};
 use sdset::SetBuf;
 
 use crate::{DocIndex, DocumentId};
@@ -45,11 +44,18 @@ impl RawIndexer {
         let mut number_of_words = 0;
 
         let analyzed_text = self.analyzer.analyze(text);
-        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| t.is_word()).enumerate() {
-            print!("token: {}", token.word);
+        for (word_pos, token) in analyzed_text.tokens()
+            .scan(0, |offset, mut token| {
+                token.char_index += *offset;
+                if let Some(SeparatorKind::Hard) = token.is_separator() {
+                    *offset += 8;
+                }
+                Some(token)
+            })
+            .filter(|t| t.is_word())
+            .enumerate() {
             let must_continue = index_token(
                 token,
-                token_index,
                 word_pos,
                 id,
                 indexed_pos,
@@ -72,37 +78,39 @@
     where
         I: IntoIterator<Item = &'a str>,
     {
-        let mut token_index_offset = 0;
         let mut byte_offset = 0;
         let mut word_offset = 0;
 
         for s in iter.into_iter() {
-            let current_token_index_offset = token_index_offset;
             let current_byte_offset = byte_offset;
             let current_word_offset = word_offset;
 
             let analyzed_text = self.analyzer.analyze(s);
             let tokens = analyzed_text
                 .tokens()
-                .enumerate()
-                .filter(|(_, t)| t.is_word())
-                .map(|(i, mut t)| {
+                .scan(0, |offset, mut token| {
+                    token.char_index += *offset;
+                    if let Some(SeparatorKind::Hard) = token.is_separator() {
+                        *offset += 8;
+                    }
+                    Some(token)
+                })
+                .filter(|t| t.is_word())
+                .map(|mut t| {
                     t.byte_start = t.byte_start + current_byte_offset;
                     t.byte_end = t.byte_end + current_byte_offset;
-                    (i + current_token_index_offset, t)
+                    t
                 })
                 .enumerate()
                 .map(|(i, t)| (i + current_word_offset, t));
 
-            for (word_pos, (token_index, token)) in tokens {
-                token_index_offset = token_index + 1;
+            for (word_pos, token) in tokens {
                 word_offset = word_pos + 1;
                 byte_offset = token.byte_end + 1;
 
                 let must_continue = index_token(
                     token,
                     word_pos,
-                    token_index,
                     id,
                     indexed_pos,
                     self.word_limit,
@@ -144,7 +152,6 @@ impl RawIndexer {
 
 fn index_token(
     token: Token,
-    position: usize,
     word_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
@@ -153,7 +160,6 @@ fn index_token(
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
-    println!(" position {}, limit: {}", position, word_limit);
     if word_pos >= word_limit {
         return false;
     }
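
Note on the change: the same scan adapter is inlined three times in this patch
(split_query_string, RawIndexer::index_text and RawIndexer::index_text_seq), and
the constant 8 is presumably chosen so that words on opposite sides of a hard
separator (".", "!", "?", ...) end up farther apart than the proximity window
used for ranking, so they are no longer scored as part of the same phrase. A
follow-up could hoist the adapter into a shared helper. The sketch below shows
the idea in isolation; `Token` and `SeparatorKind` here are simplified stand-ins
for the meilisearch_tokenizer types, not the real API.

    // Minimal, self-contained sketch of the hard separator offset.
    // `Token` and `SeparatorKind` are simplified stand-ins, kept only to
    // make the example runnable; the real meilisearch_tokenizer API differs.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum SeparatorKind {
        Soft,
        Hard,
    }

    #[derive(Debug, Clone)]
    struct Token {
        word: String,
        char_index: usize,
        separator: Option<SeparatorKind>,
    }

    impl Token {
        fn is_separator(&self) -> Option<SeparatorKind> {
            self.separator
        }

        fn is_word(&self) -> bool {
            self.separator.is_none()
        }
    }

    /// Shift `char_index` by 8 for every hard separator seen so far, so that
    /// words in different sentences are pushed apart before indexing.
    fn hard_separator_offset(
        tokens: impl Iterator<Item = Token>,
    ) -> impl Iterator<Item = Token> {
        tokens.scan(0, |offset, mut token| {
            token.char_index += *offset;
            if let Some(SeparatorKind::Hard) = token.is_separator() {
                *offset += 8;
            }
            Some(token)
        })
    }

    fn main() {
        let tokens = vec![
            Token { word: "hello".into(), char_index: 0, separator: None },
            Token { word: ".".into(), char_index: 5, separator: Some(SeparatorKind::Hard) },
            Token { word: " ".into(), char_index: 6, separator: Some(SeparatorKind::Soft) },
            Token { word: "world".into(), char_index: 7, separator: None },
        ];
        // Prints "hello @ 0" then "world @ 15": the hard separator moved
        // "world" from position 7 to 15, outside a typical proximity window.
        for token in hard_separator_offset(tokens.into_iter()).filter(Token::is_word) {
            println!("{} @ {}", token.word, token.char_index);
        }
    }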