highlight with new tokenizer

mpostma 2020-12-23 20:04:19 +01:00
parent 1ae761311e
commit 4f7f7538f7
No known key found for this signature in database
GPG Key ID: CBC8A7C1D7A28C3A
6 changed files with 49 additions and 30 deletions

Cargo.lock (generated, 2 changed lines)

@@ -711,7 +711,7 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850"
 dependencies = [
  "character_converter",
  "cow-utils",

http-ui/Cargo.lock (generated, 4 changed lines)

@@ -803,10 +803,12 @@ dependencies = [
  "byte-unit",
  "bytes",
  "flate2",
+ "fst",
  "futures",
  "grenad",
  "heed",
  "log",
+ "meilisearch-tokenizer",
  "memmap",
  "milli",
  "once_cell",
@@ -1039,7 +1041,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#8d91cd52f30aa4b651a085c15056938f7b599646"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
 dependencies = [
  "character_converter",
  "cow-utils",


@@ -32,3 +32,4 @@ warp = "0.2.2"
 # logging
 log = "0.4.11"
 stderrlog = "0.5.0"
+fst = "0.4.5"


@@ -27,8 +27,9 @@ use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};
-use milli::tokenizer::{simple_tokenizer, TokenType};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use fst::Set;
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
 use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
@@ -121,49 +122,61 @@ pub struct IndexerOpt {
     pub indexing_jobs: Option<usize>,
 }
 
-fn highlight_record(
-    object: &mut Map<String, Value>,
-    words_to_highlight: &HashSet<String>,
-    attributes_to_highlight: &HashSet<String>,
-) {
-    // TODO do we need to create a string for element that are not and needs to be highlight?
-    fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value {
+struct Highlighter<'a, A> {
+    analyzer: Analyzer<'a, A>,
+}
+
+impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
+    fn new(stop_words: &'a fst::Set<A>) -> Self {
+        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        Self { analyzer }
+    }
+
+    fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
         match value {
             Value::Null => Value::Null,
             Value::Bool(boolean) => Value::Bool(boolean),
             Value::Number(number) => Value::Number(number),
             Value::String(old_string) => {
                 let mut string = String::new();
-                for (token_type, token) in simple_tokenizer(&old_string) {
-                    if token_type == TokenType::Word {
-                        let lowercase_token = token.to_lowercase();
-                        let to_highlight = words_to_highlight.contains(&lowercase_token);
+                let analyzed = self.analyzer.analyze(&old_string);
+                for (word, token) in analyzed.reconstruct() {
+                    if token.is_word() {
+                        let to_highlight = words_to_highlight.contains(token.text());
                         if to_highlight { string.push_str("<mark>") }
-                        string.push_str(token);
+                        string.push_str(word);
                         if to_highlight { string.push_str("</mark>") }
                     } else {
-                        string.push_str(token);
+                        string.push_str(word);
                     }
                 }
                 Value::String(string)
             },
             Value::Array(values) => {
                 Value::Array(values.into_iter()
-                    .map(|v| highlight_value(v, words_to_highlight))
+                    .map(|v| self.highlight_value(v, words_to_highlight))
                     .collect())
             },
             Value::Object(object) => {
                 Value::Object(object.into_iter()
-                    .map(|(k, v)| (k, highlight_value(v, words_to_highlight)))
+                    .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
                     .collect())
             },
         }
     }
 
-    for (key, value) in object.iter_mut() {
-        if attributes_to_highlight.contains(key) {
-            let old_value = mem::take(value);
-            *value = highlight_value(old_value, words_to_highlight);
+    fn highlight_record(
+        &self,
+        object: &mut Map<String, Value>,
+        words_to_highlight: &HashSet<String>,
+        attributes_to_highlight: &HashSet<String>,
+    ) {
+        // TODO do we need to create a string for element that are not and needs to be highlight?
+        for (key, value) in object.iter_mut() {
+            if attributes_to_highlight.contains(key) {
+                let old_value = mem::take(value);
+                *value = self.highlight_value(old_value, words_to_highlight);
+            }
         }
     }
 }
@@ -651,10 +664,13 @@ async fn main() -> anyhow::Result<()> {
             None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
         };
 
+        let stop_words = fst::Set::default();
+        let highlighter = Highlighter::new(&stop_words);
+
         for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
             let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
             if !disable_highlighting {
-                highlight_record(&mut object, &found_words, &attributes_to_highlight);
+                highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
             }
 
             documents.push(object);
@@ -716,7 +732,7 @@ async fn main() -> anyhow::Result<()> {
             }
 
             let file = file.into_std().await;
-            let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };
+            let mmap = unsafe { memmap::Mmap::map(&file).expect("can't map file") };
 
             let method = match update_method.as_deref() {
                 Some("replace") => String::from("replace"),


@@ -54,8 +54,8 @@ mod tests {
             match $test {
                 Quoted(val) => assert_eq!(val.text(), $val),
                 Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
             }
         };
         ($test:expr, Free($val:literal)) => {
             match $test {


@@ -8,21 +8,21 @@ use std::{cmp, iter};
 use anyhow::{bail, Context};
 use bstr::ByteSlice as _;
-use fst::Set;
 use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
 use tempfile::tempfile;
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind};
+use fst::Set;
 
 use crate::facet::FacetType;
-use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
 use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
+use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
@@ -167,7 +167,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             // MTBL writers
             docid_word_positions_writer,
             documents_writer,
-            //tokenizer
+            // tokenizer
            analyzer,
         })
     }
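
The store only gains the new tokenizer imports and a comment tweak in the hunks above; the analyzer field was already in place. As a rough sketch of how such an analyzer yields the word tokens of a field, using the same analyze()/reconstruct() calls as the http-ui highlighter (the helper name and its output shape are assumptions, not part of this commit):

use meilisearch_tokenizer::Analyzer;

// Hypothetical helper: keep only word tokens and return their normalized text
// (token.text()) rather than the original slice that reconstruct() also yields.
fn word_tokens<A: AsRef<[u8]>>(analyzer: &Analyzer<'_, A>, text: &str) -> Vec<String> {
    let analyzed = analyzer.analyze(text);
    analyzed
        .reconstruct()
        .filter(|(_, token)| token.is_word())
        .map(|(_, token)| token.text().to_string())
        .collect()
}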