From e5adfaade067ef8aee432f49ff3a65b485c71d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Sep 2020 10:24:31 +0200 Subject: [PATCH] Replace the token filter with a filter mapper --- src/bin/indexer.rs | 4 ++-- src/tokenizer.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index d091b0912..c216c2778 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -22,7 +22,7 @@ use roaring::RoaringBitmap; use structopt::StructOpt; use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec}; -use milli::tokenizer::{simple_tokenizer, only_words}; +use milli::tokenizer::{simple_tokenizer, only_token}; use milli::{SmallVec32, Index, DocumentId, BEU32}; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -290,7 +290,7 @@ impl Store { let document_id = DocumentId::try_from(document_id).context("generated id is too big")?; for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { - for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) { + for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { let word = token.to_lowercase(); let position = (attr * MAX_POSITION + pos) as u32; self.insert_word_docid(&word, document_id)?; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 782fbdcc5..dca71d744 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -16,6 +16,6 @@ pub fn simple_tokenizer(text: &str) -> impl Iterator { }) } -pub fn only_words((t, _): &(TokenType, &str)) -> bool { - *t == TokenType::Word +pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> { + if t == TokenType::Word { Some(w) } else { None } }