From db64e19b8d79ac3b535011ebff17d170969104ea Mon Sep 17 00:00:00 2001 From: mpostma Date: Wed, 2 Dec 2020 15:21:24 +0100 Subject: [PATCH] all tests pass --- meilisearch-core/src/query_tree.rs | 2 +- meilisearch-core/src/raw_indexer.rs | 83 ++++++++++++++--------------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 9ecd38f0f..e1485566e 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -8,7 +8,7 @@ use std::{cmp, fmt, iter::once}; use fst::{IntoStreamer, Streamer}; use itertools::{EitherOrBoth, merge_join_by}; use log::debug; -use meilisearch_tokenizer::{Token, token::SeparatorKind, TokenKind}; +use meilisearch_tokenizer::Token; use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig}; use sdset::{Set, SetBuf, SetOperation}; diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index a6bff7f0c..8ed709324 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -46,32 +46,12 @@ where } } + pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize { let mut number_of_words = 0; let analyzed_text = self.analyzer.analyze(text); - for (token_pos, (word_pos, token)) in analyzed_text - .tokens() - .scan((0, false), |(offset, is_hard_sep), mut token| { - match token.kind { - TokenKind::Word => { - token.char_index += *offset; - if *is_hard_sep { - *offset += 8; - } else { - *offset += 1; - } - *is_hard_sep = false; - } - TokenKind::Separator(SeparatorKind::Hard) => { - *is_hard_sep = true; - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) - .enumerate() { + for (token_pos, (word_pos, token)) in process_tokens(analyzed_text.tokens()).enumerate() { let must_continue = index_token( token, word_pos, @@ -105,27 +85,7 @@ where let current_word_offset = word_offset; let analyzed_text = self.analyzer.analyze(s); - let tokens = analyzed_text - .tokens() - .scan((0, false), |(offset, is_hard_sep), mut token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { - token.char_index += *offset; - if *is_hard_sep { - *offset += 8; - } else { - *offset += 1; - } - *is_hard_sep = false; - } - TokenKind::Separator(SeparatorKind::Hard) => { - *is_hard_sep = true; - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) + let tokens = process_tokens(analyzed_text.tokens()) .map(|(i, mut t)| { t.byte_start = t.byte_start + current_byte_offset; t.byte_end = t.byte_end + current_byte_offset; @@ -181,6 +141,31 @@ where } } +fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { + tokens + .scan((0, None), |(offset, sepkind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + *offset += match *sepkind { + Some(SeparatorKind::Hard) => 8, + Some(SeparatorKind::Soft) => 1, + None => 0, + }; + *sepkind = None; + } + TokenKind::Separator(SeparatorKind::Hard) => { + *sepkind = Some(SeparatorKind::Hard); + } + TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => { + *sepkind = Some(SeparatorKind::Soft); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} + fn index_token( token: Token, word_pos: usize, @@ -236,6 +221,18 @@ fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, wor mod tests { use super::*; use meilisearch_schema::IndexedPos; + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use fst::Set; + + #[test] + fn test_process_token() { + let text = " Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + let stopwords = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords)); + let analyzer = analyzer.analyze(text); + let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect(); + println!("tokens: {:?}", tokens); + } #[test] fn strange_apostrophe() {