highlight with new tokenizer

parent 1ae761311e
commit 4f7f7538f7
Cargo.lock (generated): 2 changed lines
@@ -711,7 +711,7 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850"
 dependencies = [
  "character_converter",
  "cow-utils",
http-ui/Cargo.lock (generated): 4 changed lines
@@ -803,10 +803,12 @@ dependencies = [
  "byte-unit",
  "bytes",
  "flate2",
+ "fst",
  "futures",
  "grenad",
  "heed",
  "log",
+ "meilisearch-tokenizer",
  "memmap",
  "milli",
  "once_cell",
@@ -1039,7 +1041,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#8d91cd52f30aa4b651a085c15056938f7b599646"
+source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976"
 dependencies = [
  "character_converter",
  "cow-utils",
@@ -32,3 +32,4 @@ warp = "0.2.2"
 # logging
 log = "0.4.11"
 stderrlog = "0.5.0"
+fst = "0.4.5"
@@ -27,8 +27,9 @@ use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use fst::Set;
 
-use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
 use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
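
The import swap above replaces milli's built-in `simple_tokenizer` with the external `meilisearch-tokenizer` crate. Below is a minimal sketch of the token walk the rest of this diff builds on; the function name, sample text, and empty stop-word set are placeholders and not part of the commit, and only calls that appear elsewhere in this diff are used.

```rust
use fst::Set;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

fn token_walk_example() {
    // Empty stop-word set; the search handler further down builds the same default.
    let stop_words = Set::default();
    let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));

    // `reconstruct()` yields the original text slice next to its token, so
    // separators (spaces, punctuation) can be copied through unchanged.
    let analyzed = analyzer.analyze("Hello, world!");
    for (original, token) in analyzed.reconstruct() {
        if token.is_word() {
            println!("word: {}", token.text());
        } else {
            println!("separator: {:?}", original);
        }
    }
}
```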
@@ -121,49 +122,61 @@ pub struct IndexerOpt {
     pub indexing_jobs: Option<usize>,
 }
 
-fn highlight_record(
-    object: &mut Map<String, Value>,
-    words_to_highlight: &HashSet<String>,
-    attributes_to_highlight: &HashSet<String>,
-) {
-    // TODO do we need to create a string for element that are not and needs to be highlight?
-    fn highlight_value(value: Value, words_to_highlight: &HashSet<String>) -> Value {
+struct Highlighter<'a, A> {
+    analyzer: Analyzer<'a, A>,
+}
+
+impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
+    fn new(stop_words: &'a fst::Set<A>) -> Self {
+        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        Self { analyzer }
+    }
+
+    fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value {
         match value {
             Value::Null => Value::Null,
             Value::Bool(boolean) => Value::Bool(boolean),
             Value::Number(number) => Value::Number(number),
             Value::String(old_string) => {
                 let mut string = String::new();
-                for (token_type, token) in simple_tokenizer(&old_string) {
-                    if token_type == TokenType::Word {
-                        let lowercase_token = token.to_lowercase();
-                        let to_highlight = words_to_highlight.contains(&lowercase_token);
+                let analyzed = self.analyzer.analyze(&old_string);
+                for (word, token) in analyzed.reconstruct() {
+                    if token.is_word() {
+                        let to_highlight = words_to_highlight.contains(token.text());
                         if to_highlight { string.push_str("<mark>") }
-                        string.push_str(token);
+                        string.push_str(word);
                         if to_highlight { string.push_str("</mark>") }
                     } else {
-                        string.push_str(token);
+                        string.push_str(word);
                     }
                 }
                 Value::String(string)
             },
             Value::Array(values) => {
                 Value::Array(values.into_iter()
-                    .map(|v| highlight_value(v, words_to_highlight))
+                    .map(|v| self.highlight_value(v, words_to_highlight))
                     .collect())
             },
             Value::Object(object) => {
                 Value::Object(object.into_iter()
-                    .map(|(k, v)| (k, highlight_value(v, words_to_highlight)))
+                    .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight)))
                     .collect())
             },
         }
     }
 
-    for (key, value) in object.iter_mut() {
-        if attributes_to_highlight.contains(key) {
-            let old_value = mem::take(value);
-            *value = highlight_value(old_value, words_to_highlight);
+    fn highlight_record(
+        &self,
+        object: &mut Map<String, Value>,
+        words_to_highlight: &HashSet<String>,
+        attributes_to_highlight: &HashSet<String>,
+    ) {
+        // TODO do we need to create a string for element that are not and needs to be highlight?
+        for (key, value) in object.iter_mut() {
+            if attributes_to_highlight.contains(key) {
+                let old_value = mem::take(value);
+                *value = self.highlight_value(old_value, words_to_highlight);
+            }
         }
     }
 }
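
For context, here is a hedged sketch of how the new `Highlighter` is driven, mirroring the search-handler hunk below; the record contents, matched words, and attribute names are made up for illustration.

```rust
use std::collections::HashSet;
use serde_json::{Map, Value};

fn highlight_example() {
    // Build the highlighter once with an (empty) stop-word set, as the handler does.
    let stop_words = fst::Set::default();
    let highlighter = Highlighter::new(&stop_words);

    // Words reported as matches by the search, and the fields to wrap in <mark> tags.
    let found_words: HashSet<String> = vec!["tokenizer".to_string()].into_iter().collect();
    let attributes_to_highlight: HashSet<String> = vec!["title".to_string()].into_iter().collect();

    let mut record: Map<String, Value> = Map::new();
    record.insert("title".into(), Value::String("highlight with new tokenizer".into()));

    highlighter.highlight_record(&mut record, &found_words, &attributes_to_highlight);
    // record["title"] should now contain "highlight with new <mark>tokenizer</mark>".
}
```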
@@ -651,10 +664,13 @@ async fn main() -> anyhow::Result<()> {
                 None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(),
             };
 
+            let stop_words = fst::Set::default();
+            let highlighter = Highlighter::new(&stop_words);
+
             for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
                 let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
                 if !disable_highlighting {
-                    highlight_record(&mut object, &found_words, &attributes_to_highlight);
+                    highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight);
                 }
 
                 documents.push(object);
@@ -716,7 +732,7 @@ async fn main() -> anyhow::Result<()> {
            }
 
            let file = file.into_std().await;
-           let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };
+           let mmap = unsafe { memmap::Mmap::map(&file).expect("can't map file") };
 
            let method = match update_method.as_deref() {
                Some("replace") => String::from("replace"),
|
@ -54,8 +54,8 @@ mod tests {
|
|||||||
match $test {
|
match $test {
|
||||||
Quoted(val) => assert_eq!(val.text(), $val),
|
Quoted(val) => assert_eq!(val.text(), $val),
|
||||||
Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
|
Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
($test:expr, Free($val:literal)) => {
|
($test:expr, Free($val:literal)) => {
|
||||||
match $test {
|
match $test {
|
||||||
|
@@ -8,21 +8,21 @@ use std::{cmp, iter};
 
 use anyhow::{bail, Context};
 use bstr::ByteSlice as _;
+use fst::Set;
 use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
 use tempfile::tempfile;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind};
-use fst::Set;
 
 use crate::facet::FacetType;
-use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
 use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
+use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
 
@@ -167,7 +167,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             // MTBL writers
             docid_word_positions_writer,
             documents_writer,
-            //tokenizer
+            // tokenizer
             analyzer,
         })
     }
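
The indexing side keeps the same kind of analyzer around: the `analyzer` field set above, behind the `A: AsRef<[u8]>` bound visible in the hunk header. A rough, illustrative sketch of how such a field can be initialized from a stop-word set, following the same pattern as `Highlighter::new` earlier in this diff; `MiniStore` is a made-up stand-in, and the real `Store::new` takes many more writers and indexing options than shown here.

```rust
use fst::Set;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

// Illustrative stand-in for the real Store, which also owns MTBL writers,
// caches, and indexing options that are out of scope for this sketch.
struct MiniStore<'s, A> {
    analyzer: Analyzer<'s, A>,
}

impl<'s, A: AsRef<[u8]>> MiniStore<'s, A> {
    fn new(stop_words: &'s Set<A>) -> Self {
        // Same construction as in http-ui: default analyzer config plus stop words.
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
        MiniStore { analyzer }
    }
}
```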
|
Loading…
Reference in New Issue
Block a user