meilisearch/meilisearch (https://github.com/meilisearch/meilisearch.git)
commit 246286f0eb, parent 6bf6b40495

take hard separator into account
@@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -471,14 +471,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             };
 
             let analyzed = self.analyzer.analyze(&content);
-            let tokens = analyzed
-                .tokens()
-                .filter(|t| t.is_word())
-                .map(|t| t.text().to_string());
+            let tokens = process_tokens(analyzed.tokens());
 
-            for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
+            for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
                 let position = (attr as usize * MAX_POSITION + pos) as u32;
-                words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
+                words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
             }
         }
     }
@@ -609,6 +606,36 @@ enum FacetValue {
     Integer(i64),
 }
 
+/// Take an iterator over tokens and compute their relative positions depending on separator kinds:
+/// if it's a `Hard` separator, we add an additional relative proximity of 8 between words;
+/// otherwise we keep the standard proximity of 1 between words.
+fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
+    tokens
+        .skip_while(|token| token.is_separator().is_some())
+        .scan((0, None), |(offset, prev_kind), token| {
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                    *offset += match *prev_kind {
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                        Some(_) => 1,
+                        None => 0,
+                    };
+                    *prev_kind = Some(token.kind)
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => {
+                    *prev_kind = Some(token.kind);
+                }
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
+                    *prev_kind = Some(token.kind);
+                }
+                _ => (),
+            }
+            Some((*offset, token))
+        })
+        .filter(|(_, t)| t.is_word())
+}
+
 fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
     use FacetValue::*;
 
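For illustration, below is a minimal, self-contained sketch of the position logic this commit introduces. The `Kind` enum and `positions` function are hypothetical stand-ins, not the real meilisearch_tokenizer API; they only reproduce the scan in `process_tokens` above. It also shows why the loop switched from `.enumerate().take(MAX_POSITION)` to `.take_while(|(pos, _)| *pos < MAX_POSITION)`: positions are no longer dense (a hard separator skips 8 at once), so the cap must apply to the position value rather than to the token count.

// Not part of the commit: a self-contained sketch with hypothetical
// stand-in types (`Kind`, `positions`) instead of meilisearch_tokenizer.

#[derive(Clone, Copy, PartialEq, Debug)]
enum Kind {
    Word,
    Soft, // soft separator, e.g. a space: words stay 1 position apart
    Hard, // hard separator, e.g. a period: words jump 8 positions apart
}

/// Mirrors the `process_tokens` scan: compute a relative position for every
/// word, skipping leading separators and letting a hard separator dominate
/// any soft ones around it.
fn positions(tokens: &[(Kind, &str)]) -> Vec<(usize, String)> {
    tokens
        .iter()
        .copied()
        .skip_while(|(kind, _)| *kind != Kind::Word)
        .scan((0usize, None), |(offset, prev), (kind, text)| {
            match kind {
                Kind::Word => {
                    *offset += match *prev {
                        Some(Kind::Hard) => 8, // extra proximity after a hard separator
                        Some(_) => 1,          // standard proximity between words
                        None => 0,             // the first word keeps position 0
                    };
                    *prev = Some(kind);
                }
                Kind::Hard => *prev = Some(kind),
                // a soft separator never overrides a pending hard one
                Kind::Soft if *prev != Some(Kind::Hard) => *prev = Some(kind),
                _ => (),
            }
            Some((*offset, kind, text.to_string()))
        })
        .filter(|(_, kind, _)| *kind == Kind::Word)
        .map(|(pos, _, text)| (pos, text))
        .collect()
}

fn main() {
    let tokens = [
        (Kind::Word, "hello"),
        (Kind::Soft, " "),
        (Kind::Word, "world"),
        (Kind::Hard, "."),
        (Kind::Soft, " "),
        (Kind::Word, "good"),
        (Kind::Soft, " "),
        (Kind::Word, "morning"),
    ];
    // "world" -> "good" crosses a hard separator, so the gap is 8, not 1.
    assert_eq!(
        positions(&tokens),
        vec![
            (0, "hello".to_string()),
            (1, "world".to_string()),
            (9, "good".to_string()),
            (10, "morning".to_string()),
        ]
    );
}

The widened gap means a query matching "world good" is ranked as if the two words were 8 positions apart, so phrases that cross a sentence boundary score a worse proximity than phrases inside one sentence.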