fix suggestions

2024-11-23 10:37:41 +08:00 · 2020-12-03 12:34:22 +01:00 · 2020-12-03 12:34:22 +01:00 · 8e64a24d19
commit 8e64a24d19
parent 8b149c9aa3
3 changed files with 15 additions and 23 deletions
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -395,7 +395,6 @@ mod tests {
            let mut writer = db.main_write_txn().unwrap();
            let word = normalize_str(word);
            println!("synonym: {}", word);
            let alternatives = self
                .index
@@ -1261,7 +1260,6 @@ mod tests {
        let builder = store.query_builder();
        let SortResult { documents, .. } = builder.query(&reader, Some("telephone"), 0..20).unwrap();
        println!("documents: {:#?}", documents);
        let mut iter = documents.into_iter();
        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
@@ -1297,7 +1295,6 @@ mod tests {
        let builder = store.query_builder();
        let SortResult { documents, .. } = builder.query(&reader, Some("télephone"), 0..20).unwrap();
        let mut iter = documents.into_iter();
        // this test was in the opposite order, I am not sure why...
        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
            let mut iter = matches.into_iter();
            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. }));
--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@@ -14,10 +14,7 @@ const WORD_LENGTH_LIMIT: usize = 80;
 type Word = Vec<u8>; // TODO make it be a SmallVec
-pub struct RawIndexer<'a, A>
+pub struct RawIndexer<'a, A> {
 where
    A: AsRef<[u8]>
 {
    word_limit: usize, // the maximum number of indexed words
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
@@ -73,25 +70,24 @@ where
        number_of_words
    }
-    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
+    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, text_iter: I)
    where
        I: IntoIterator<Item = &'s str>,
    {
        let mut byte_offset = 0;
        let mut word_offset = 0;
-        for s in iter.into_iter() {
+        for text in text_iter.into_iter() {
            let current_byte_offset = byte_offset;
            let current_word_offset = word_offset;
-            let analyzed_text = self.analyzer.analyze(s);
+            let analyzed_text = self.analyzer.analyze(text);
            let tokens = process_tokens(analyzed_text.tokens())
                .map(|(i, mut t)| {
                    t.byte_start = t.byte_start + current_byte_offset;
                    t.byte_end = t.byte_end + current_byte_offset;
-                    (i, t)
+                    (i + current_word_offset, t)
                })
                .map(|(i, t)| (i + current_word_offset, t))
                .enumerate();
            for (token_pos, (word_pos, token)) in tokens  {
@@ -143,21 +139,22 @@ where
 fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
    tokens
-        .scan((0, None), |(offset, sepkind), token| {
+        .scan((0, None), |(offset, prev_kind), token| {
                match token.kind {
                    TokenKind::Word | TokenKind::StopWord | TokenKind::Any => {
-                        *offset += match *sepkind {
+                        *offset += match *prev_kind {
                            Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                            Some(_) => 1,
                            None => 0,
                        };
-                        *sepkind = Some(token.kind)
+                        *prev_kind = Some(token.kind)
                    }
                    TokenKind::Separator(SeparatorKind::Hard) => {
-                        *sepkind = Some(token.kind);
+                        *prev_kind = Some(token.kind);
                    }
-                    TokenKind::Separator(SeparatorKind::Soft) if sepkind.is_none() => {
+                    TokenKind::Separator(SeparatorKind::Soft)
-                        *sepkind = Some(token.kind);
+                        if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
                        *prev_kind = Some(token.kind);
                    }
                    _ => (),
                }
@@ -226,12 +223,12 @@ mod tests {
    #[test]
    fn test_process_token() {
-        let text = " Zut, l’aspirateur, j’ai oublié de l’éteindre !";
+        let text = " 為一包含一千多萬目詞的帶標記平衡語料庫";
        let stopwords = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords));
        let analyzer = analyzer.analyze(text);
-        let tokens: Vec<_> = process_tokens(analyzer.tokens()).collect();
+        let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect();
-        println!("tokens: {:?}", tokens);
+        assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]);
    }
    #[test]
--- a/meilisearch-http/tests/placeholder_search.rs
+++ b/meilisearch-http/tests/placeholder_search.rs
@@ -102,8 +102,6 @@ async fn placeholder_search_witch_crop() {
        "cropLength": 20
    });
    println!("here");
    test_post_get_search!(server, query, |response, status_code| {
        assert_eq!(status_code, 200);