From e5adfaade067ef8aee432f49ff3a65b485c71d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Sep 2020 10:24:31 +0200 Subject: [PATCH] Replace the token filter with a filter mapper --- src/bin/indexer.rs | 4 ++-- src/tokenizer.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index d091b0912..c216c2778 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -22,7 +22,7 @@ use roaring::RoaringBitmap; use structopt::StructOpt; use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec}; -use milli::tokenizer::{simple_tokenizer, only_words}; +use milli::tokenizer::{simple_tokenizer, only_token}; use milli::{SmallVec32, Index, DocumentId, BEU32}; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -290,7 +290,7 @@ impl Store { let document_id = DocumentId::try_from(document_id).context("generated id is too big")?; for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { - for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) { + for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { let word = token.to_lowercase(); let position = (attr * MAX_POSITION + pos) as u32; self.insert_word_docid(&word, document_id)?; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 782fbdcc5..dca71d744 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -16,6 +16,6 @@ pub fn simple_tokenizer(text: &str) -> impl Iterator { }) } -pub fn only_words((t, _): &(TokenType, &str)) -> bool { - *t == TokenType::Word +pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> { + if t == TokenType::Word { Some(w) } else { None } }