From c6434f609c37b20eca87cd4036fa97ebf99b0cbe Mon Sep 17 00:00:00 2001
From: mpostma
Date: Thu, 26 Nov 2020 20:01:53 +0100
Subject: [PATCH] fix indexing length

---
 meilisearch-core/src/query_tree.rs           | 18 +++----
 meilisearch-core/src/raw_indexer.rs          | 56 ++++++++++----------
 meilisearch-http/tests/placeholder_search.rs |  2 +
 3 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index aae80e395..d9473f301 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -181,27 +181,25 @@ fn split_query_string<'a, A: AsRef<[u8]>>(s: &str, stop_words: &'a fst::Set<A>)
     analyzer
         .analyze(s)
         .tokens()
-        .scan((0, None), |(offset, sepcat), mut token| {
+        .scan((0, false), |(offset, is_hard_sep), mut token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                    if let Some(SeparatorKind::Hard) = sepcat {
+                    if *is_hard_sep {
                         *offset += 8;
+                    } else {
+                        *offset += 1;
                     }
-                    *sepcat = None;
+                    *is_hard_sep = false;
                     token.char_index += *offset;
                 }
                 TokenKind::Separator(SeparatorKind::Hard) => {
-                    *sepcat = Some(SeparatorKind::Hard);
-                }
-                TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
-                    *sepcat = Some(SeparatorKind::Soft);
+                    *is_hard_sep = true;
                 }
                 _ => (),
             }
-            Some(token)
+            Some((*offset, token))
         })
-        .filter(|t| t.is_word())
-        .enumerate()
+        .filter(|(_, t)| t.is_word())
         .map(|(i, Token { word, .. })| (i, word.to_string()))
         .collect()
 }
diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs
index 0266772f6..fd8a68a43 100644
--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@@ -50,31 +50,32 @@ where
         let mut number_of_words = 0;
 
         let analyzed_text = self.analyzer.analyze(text);
-        for (word_pos, token) in analyzed_text.tokens()
-            .scan((0, None), |(offset, sepcat), mut token| {
+        for (token_pos, (word_pos, token)) in analyzed_text
+            .tokens()
+            .scan((0, false), |(offset, is_hard_sep), mut token| {
                 match token.kind {
-                    TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                        if let Some(SeparatorKind::Hard) = sepcat {
+                    TokenKind::Word => {
+                        if *is_hard_sep {
                             *offset += 8;
+                        } else {
+                            *offset += 1;
                         }
-                        *sepcat = None;
+                        *is_hard_sep = false;
                         token.char_index += *offset;
                     }
                     TokenKind::Separator(SeparatorKind::Hard) => {
-                        *sepcat = Some(SeparatorKind::Hard);
-                    }
-                    TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
-                        *sepcat = Some(SeparatorKind::Soft);
+                        *is_hard_sep = true;
                     }
                     _ => (),
                 }
-                Some(token)
+                Some((*offset, token))
             })
-            .filter(|t| t.is_word())
+            .filter(|(_, t)| t.is_word())
             .enumerate()
         {
             let must_continue = index_token(
                 token,
                 word_pos,
+                token_pos,
                 id,
                 indexed_pos,
                 self.word_limit,
@@ -106,41 +107,41 @@ where
         let analyzed_text = self.analyzer.analyze(s);
         let tokens = analyzed_text
             .tokens()
-            .scan((0, None), |(offset, sepcat), mut token| {
+            .scan((0, false), |(offset, is_hard_sep), mut token| {
                 match token.kind {
                     TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                        if let Some(SeparatorKind::Hard) = sepcat {
+                        if *is_hard_sep {
                             *offset += 8;
+                        } else {
+                            *offset += 1;
                         }
-                        *sepcat = None;
+                        *is_hard_sep = false;
                         token.char_index += *offset;
                     }
                     TokenKind::Separator(SeparatorKind::Hard) => {
-                        *sepcat = Some(SeparatorKind::Hard);
-                    }
-                    TokenKind::Separator(SeparatorKind::Soft) if *sepcat != Some(SeparatorKind::Hard) => {
-                        *sepcat = Some(SeparatorKind::Soft);
+                        *is_hard_sep = true;
                     }
                     _ => (),
                 }
-                Some(token)
+                Some((*offset, token))
             })
-            .filter(|t| t.is_word())
-            .map(|mut t| {
+            .filter(|(_, t)| t.is_word())
+            .map(|(i, mut t)| {
                 t.byte_start = t.byte_start + current_byte_offset;
                 t.byte_end = t.byte_end + current_byte_offset;
-                t
+                (i, t)
             })
-            .enumerate()
-            .map(|(i, t)| (i + current_word_offset, t));
+            .map(|(i, t)| (i + current_word_offset, t))
+            .enumerate();
 
-        for (word_pos, token) in tokens {
+        for (token_pos, (word_pos, token)) in tokens {
             word_offset = word_pos + 1;
             byte_offset = token.byte_end + 1;
 
             let must_continue = index_token(
                 token,
                 word_pos,
+                token_pos,
                 id,
                 indexed_pos,
                 self.word_limit,
@@ -183,6 +184,7 @@ where
 fn index_token(
     token: Token,
     word_pos: usize,
+    token_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
@@ -190,7 +192,7 @@ fn index_token(
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
-    if word_pos >= word_limit {
+    if token_pos >= word_limit {
         return false;
     }
 
@@ -330,7 +332,7 @@ mod tests {
         let Indexed {
             words_doc_indexes, ..
         } = indexer.build();
-        assert!(words_doc_indexes.get(&"request_buffering".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes.get(&"request".to_owned().into_bytes()).is_some());
     }
 
     #[test]
diff --git a/meilisearch-http/tests/placeholder_search.rs b/meilisearch-http/tests/placeholder_search.rs
index 048ab7f8b..fb1286248 100644
--- a/meilisearch-http/tests/placeholder_search.rs
+++ b/meilisearch-http/tests/placeholder_search.rs
@@ -102,6 +102,8 @@ async fn placeholder_search_witch_crop() {
         "cropLength": 20
     });
 
+    println!("here");
+
     test_post_get_search!(server, query, |response, status_code| {
         assert_eq!(status_code, 200);
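
A note on the scheme this patch settles on: the `scan` state machine threads
`(offset, is_hard_sep)` through the token stream and gives every word two
distinct numbers. The scanned offset (`word_pos`) advances by 1 over a soft
separator and by 8 over a hard one, so proximity ranking sees phrase
boundaries; the `enumerate` index (`token_pos`) is a plain count of indexed
words. The bug being fixed is that the old code compared the inflated
`word_pos` against `word_limit`, exhausting the limit well before
`word_limit` actual words had been indexed; hence the switch to
`token_pos >= word_limit`. The sketch below is a minimal, self-contained
illustration of the positioning scheme, not the real implementation: it uses
a toy tokenizer, and `Kind`, `tokenize`, and `positions` are hypothetical
names, whereas the real code runs the same `scan`/`filter`/`enumerate` chain
over meilisearch-tokenizer's `Token`/`TokenKind`.

    // Toy model of the word_pos / token_pos scheme from this patch.
    // Only the scan/filter/enumerate chain mirrors the real code.
    #[derive(Debug, Clone, PartialEq)]
    enum Kind {
        Word,
        SoftSeparator, // e.g. ' '
        HardSeparator, // e.g. '.', ',' or ';'
    }

    #[derive(Debug, Clone)]
    struct Token {
        word: String,
        kind: Kind,
    }

    // Hypothetical tokenizer: spaces are soft separators, basic punctuation
    // is hard. Each part produced by split_inclusive keeps its separator.
    fn tokenize(s: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        for part in s.split_inclusive(|c: char| matches!(c, ' ' | '.' | ',' | ';')) {
            let (word, sep) = match part.char_indices().last() {
                Some((i, ' ')) => (&part[..i], Some(Kind::SoftSeparator)),
                Some((i, '.' | ',' | ';')) => (&part[..i], Some(Kind::HardSeparator)),
                _ => (part, None),
            };
            if !word.is_empty() {
                tokens.push(Token { word: word.to_string(), kind: Kind::Word });
            }
            if let Some(kind) = sep {
                tokens.push(Token { word: String::new(), kind });
            }
        }
        tokens
    }

    // Mirrors the patch: word_pos jumps by 8 across a hard separator and by 1
    // otherwise; token_pos simply counts the words that survive the filter.
    fn positions(s: &str) -> Vec<(usize, usize, String)> {
        tokenize(s)
            .into_iter()
            .scan((0usize, false), |(offset, is_hard_sep), token| {
                match token.kind {
                    Kind::Word => {
                        *offset += if *is_hard_sep { 8 } else { 1 };
                        *is_hard_sep = false;
                    }
                    Kind::HardSeparator => *is_hard_sep = true,
                    Kind::SoftSeparator => (),
                }
                Some((*offset, token))
            })
            .filter(|(_, t)| t.kind == Kind::Word)
            .enumerate()
            .map(|(token_pos, (word_pos, t))| (token_pos, word_pos, t.word))
            .collect()
    }

    fn main() {
        // "river" is the third word (token_pos 2), but its word_pos jumps by
        // 8 because the hard separator '.' precedes it.
        for (token_pos, word_pos, word) in positions("the old. river runs") {
            println!("token_pos={token_pos} word_pos={word_pos} word={word}");
        }
    }

Running this on "the old. river runs" prints word positions 1, 2, 10, 11
against token positions 0 through 3: the hard stop before "river" widens the
proximity gap by 8 while the word-limit counter advances by only one. Under
the old check, separator-heavy text would hit a limit of, say, 1000 after far
fewer than 1000 words; counting `token_pos` restores the limit's literal
meaning, which is what the updated `request`/`request_buffering` test above
exercises.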