mirror of https://github.com/meilisearch/meilisearch.git
commit 67e25f8724

Cargo.lock (generated)
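Summary of the change below: the meilisearch-tokenizer dependency is bumped from v0.1.4 to v0.2.0 and stop-word support is threaded through the engine. Stop words are stored as an FST in the index, exposed through the Settings update builder and the JSON settings struct, skipped when the query tree is built, and handed to the tokenizer while documents are indexed.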
@@ -1206,7 +1206,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
 [[package]]
 name = "meilisearch-tokenizer"
 version = "0.1.1"
-source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.4#31ba3ff4a15501f12b7d37ac64ddce7c35a9757c"
+source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0#833c48b2ee39071f8b4f51abd15122afdb3c8c06"
 dependencies = [
  "character_converter",
  "cow-utils",
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
 heed = "0.10.6"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" }
 memmap = "0.7.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{File, create_dir_all};
 use std::net::SocketAddr;
@@ -128,7 +128,10 @@ struct Highlighter<'a, A> {
 
 impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
     fn new(stop_words: &'a fst::Set<A>) -> Self {
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        config.stop_words(stop_words);
+        let analyzer = Analyzer::new(config);
 
         Self { analyzer }
     }
@@ -266,6 +269,13 @@ struct Settings {
         skip_serializing_if = "Option::is_none",
     )]
     criteria: Option<Option<Vec<String>>>,
+
+    #[serde(
+        default,
+        deserialize_with = "deserialize_some",
+        skip_serializing_if = "Option::is_none",
+    )]
+    stop_words: Option<Option<BTreeSet<String>>>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
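The deserialize_with = "deserialize_some" helper referenced above is not shown in this diff; the usual serde idiom behind that name, given here only as a hedged sketch, maps any present value (including an explicit null) to Some, so a missing field and a null field can be told apart:

use serde::{Deserialize, Deserializer};

// A present value deserializes to Some(value); for an Option<T> target an explicit
// `null` becomes Some(None), while an absent field falls back to `default`, i.e. None.
// This is what lets the settings distinguish "unchanged" from "reset".
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where
    T: Deserialize<'de>,
    D: Deserializer<'de>,
{
    Deserialize::deserialize(deserializer).map(Some)
}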
@@ -439,6 +449,14 @@ async fn main() -> anyhow::Result<()> {
                     }
                 }
 
+                // We transpose the settings JSON struct into a real setting update.
+                if let Some(stop_words) = settings.stop_words {
+                    match stop_words {
+                        Some(stop_words) => builder.set_stop_words(stop_words),
+                        None => builder.reset_stop_words(),
+                    }
+                }
+
                 let result = builder.execute(|indexing_step, update_id| {
                     let (current, total) = match indexing_step {
                         TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
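The double Option used by both Settings structs encodes three states, which is exactly what the transposition above relies on. The JSON field names are not visible in these hunks, so only the Rust side is sketched here, with illustrative values:

use std::collections::BTreeSet;

// Mirrors the match in the update handler: outer Option = "was the field present",
// inner Option = "set vs. reset".
fn describe(update: Option<Option<BTreeSet<String>>>) -> &'static str {
    match update {
        None => "leave the stop words unchanged",
        Some(None) => "reset the stop words",
        Some(Some(_)) => "replace the stop words",
    }
}

fn main() {
    assert_eq!(describe(None), "leave the stop words unchanged");
    assert_eq!(describe(Some(None)), "reset the stop words");
    assert_eq!(describe(Some(Some(BTreeSet::from(["the".to_string()])))), "replace the stop words");
}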
@@ -20,7 +20,7 @@ heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" }
 memmap = "0.7.0"
 num-traits = "0.2.14"
 obkv = "0.1.1"
@@ -28,6 +28,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
 pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
+pub const STOP_WORDS_KEY: &str = "stop-words";
 pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
@@ -377,6 +378,22 @@ impl Index {
         }
     }
 
+    /* stop words */
+
+    pub fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
+        self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes())
+    }
+
+    pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
+    }
+    pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
+            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
+            None => Ok(None),
+        }
+    }
+
     /* words prefixes fst */
 
     /// Writes the FST which is the words prefixes dictionnary of the engine.
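The accessors above store the stop words as the raw bytes of an fst::Set, so reading them back is just re-wrapping the stored bytes. A minimal standalone sketch of that round trip, using only the fst calls that appear elsewhere in this diff (Set::from_iter, as_fst().as_bytes(), Set::new):

fn main() -> anyhow::Result<()> {
    // The settings keep stop words in a BTreeSet, so they arrive already sorted,
    // which is what fst::Set::from_iter requires.
    let set = fst::Set::from_iter(vec!["are", "i", "the"])?;

    // These bytes are what put_stop_words writes under STOP_WORDS_KEY...
    let bytes = set.as_fst().as_bytes().to_vec();

    // ...and stop_words() hands them back to fst::Set::new without rebuilding anything.
    let restored = fst::Set::new(bytes)?;
    assert!(restored.contains("the"));
    assert!(!restored.contains("doggo"));
    Ok(())
}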
@@ -4,7 +4,7 @@ use std::fmt;
 use std::str::Utf8Error;
 use std::time::Instant;
 
-use fst::{IntoStreamer, Streamer, Set};
+use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
 use log::debug;
 use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
@@ -91,8 +91,7 @@ impl<'a> Search<'a> {
                 let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                 builder.optional_words(self.optional_words);
                 builder.authorize_typos(self.authorize_typos);
-                let stop_words = &Set::default();
-                let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+                let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
                 let result = analyzer.analyze(query);
                 let tokens = result.tokens();
                 builder.build(tokens)?
@@ -1,6 +1,7 @@
 use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
+use fst::Set;
 use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
@@ -154,6 +155,10 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
+    fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
+        Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
+    }
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         Ok(None)
     }
+
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+        self.index.stop_words(self.rtxn)
+    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -331,8 +340,7 @@ fn create_query_tree(
     optional_words: bool,
     authorize_typos: bool,
     query: PrimitiveQuery,
-) -> anyhow::Result<Operation>
-{
+) -> anyhow::Result<Operation> {
     /// Matches on the `PrimitiveQueryPart` and create an operation from it.
     fn resolve_primitive_part(
         ctx: &impl Context,
@@ -350,7 +358,12 @@ fn create_query_tree(
                 if let Some(child) = split_best_frequency(ctx, &word)? {
                     children.push(child);
                 }
-                children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
+
+                let is_stop_word = ctx.is_stop_word(&word)?;
+                let query = Query { prefix, kind: typos(word, authorize_typos) };
+                if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                    children.push(Operation::Query(query));
+                }
                 Ok(Operation::or(false, children))
             },
             // create a CONSECUTIVE operation wrapping all word in the phrase
|
|||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
query: &[PrimitiveQueryPart],
|
query: &[PrimitiveQueryPart],
|
||||||
) -> anyhow::Result<Operation>
|
) -> anyhow::Result<Operation> {
|
||||||
{
|
|
||||||
const MAX_NGRAM: usize = 3;
|
const MAX_NGRAM: usize = 3;
|
||||||
let mut op_children = Vec::new();
|
let mut op_children = Vec::new();
|
||||||
|
|
||||||
for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) {
|
for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
|
||||||
let mut or_op_children = Vec::new();
|
let mut or_op_children = Vec::new();
|
||||||
|
|
||||||
for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
|
for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
|
||||||
@ -381,23 +393,31 @@ fn create_query_tree(
|
|||||||
|
|
||||||
match group {
|
match group {
|
||||||
[part] => {
|
[part] => {
|
||||||
let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
|
let operation =
|
||||||
|
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
|
||||||
and_op_children.push(operation);
|
and_op_children.push(operation);
|
||||||
},
|
}
|
||||||
words => {
|
words => {
|
||||||
let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false);
|
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
|
||||||
let words: Vec<_> = words.iter().filter_map(| part| {
|
let words: Vec<_> = words
|
||||||
if let PrimitiveQueryPart::Word(word, _) = part {
|
.iter()
|
||||||
Some(word.as_str())
|
.filter_map(|part| {
|
||||||
} else {
|
if let PrimitiveQueryPart::Word(word, _) = part {
|
||||||
None
|
Some(word.as_str())
|
||||||
}
|
} else {
|
||||||
}).collect();
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
|
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
|
||||||
let concat = words.concat();
|
let concat = words.concat();
|
||||||
|
|
||||||
|
let is_stop_word = ctx.is_stop_word(&concat)?;
|
||||||
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
|
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
|
||||||
operations.push(Operation::Query(query));
|
if query.prefix || query.kind.is_tolerant() || !is_stop_word {
|
||||||
and_op_children.push(Operation::or(false, operations));
|
operations.push(Operation::Query(query));
|
||||||
|
and_op_children.push(Operation::or(false, operations));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -543,7 +563,6 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
     use std::collections::HashMap;
 
-    use fst::Set;
     use maplit::{hashmap, hashset};
     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::{Rng, SeedableRng, rngs::StdRng};
@@ -582,6 +601,10 @@ mod test {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
         }
+
+        fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+            Ok(None)
+        }
     }
 
     impl Default for TestContext {
@@ -646,8 +669,7 @@ mod test {
     #[test]
     fn prefix() {
         let query = "hey friends";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -667,8 +689,7 @@ mod test {
     #[test]
     fn no_prefix() {
        let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -688,8 +709,7 @@ mod test {
     #[test]
     fn synonyms() {
         let query = "hello world ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -720,8 +740,7 @@ mod test {
     #[test]
     fn complex_synonyms() {
         let query = "new york city ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -766,8 +785,7 @@ mod test {
     #[test]
     fn ngrams() {
         let query = "n grams ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -787,8 +805,7 @@ mod test {
     #[test]
     fn word_split() {
         let query = "wordsplit fish ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -814,8 +831,7 @@ mod test {
     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -835,8 +851,7 @@ mod test {
     #[test]
     fn optional_word() {
         let query = "hey my friend ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -875,8 +890,7 @@ mod test {
     #[test]
     fn optional_word_phrase() {
         let query = "\"hey my\"";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -892,8 +906,7 @@ mod test {
     #[test]
     fn optional_word_multiple_phrases() {
         let query = r#""hey" my good "friend""#;
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -927,8 +940,7 @@ mod test {
     #[test]
     fn no_typo() {
         let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -947,8 +959,7 @@ mod test {
     #[test]
     fn fetching_words() {
         let query = "wordsplit nyc world";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -410,6 +410,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
         };
 
+        let stop_words = self.index.stop_words(self.wtxn)?;
+        let stop_words = stop_words.as_ref();
         let linked_hash_map_size = self.linked_hash_map_size;
         let max_nb_chunks = self.max_nb_chunks;
         let max_memory = self.max_memory;
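With these two lines the stop words are read once from the index, under the indexing write transaction, and then shared by reference with every Store created below; the per-thread fst::Set::default() removed in the next hunks is no longer needed.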
@@ -436,7 +438,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let readers = rayon::iter::repeatn(documents, num_threads)
             .enumerate()
             .map(|(i, documents)| {
-                let stop_words = fst::Set::default();
                 let store = Store::new(
                     searchable_fields.clone(),
                     faceted_fields.clone(),
@@ -446,7 +447,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                     chunk_compression_type,
                     chunk_compression_level,
                     chunk_fusing_shrink_size,
-                    &stop_words,
+                    stop_words,
                 )?;
                 store.index(
                     documents,
@@ -86,7 +86,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         chunk_compression_type: CompressionType,
         chunk_compression_level: Option<u32>,
         chunk_fusing_shrink_size: Option<u64>,
-        stop_words: &'s Set<A>,
+        stop_words: Option<&'s Set<A>>,
     ) -> anyhow::Result<Self>
     {
         // We divide the max memory by the number of sorter the Store have.
@@ -141,7 +141,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             create_writer(chunk_compression_type, chunk_compression_level, f)
         })?;
 
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        if let Some(stop_words) = stop_words {
+            config.stop_words(stop_words);
+        }
+        let analyzer = Analyzer::new(config);
 
         Ok(Store {
             // Indexing parameters.
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::str::FromStr;
 
 use anyhow::Context;
@@ -32,6 +32,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
     displayed_fields: Option<Option<Vec<String>>>,
     faceted_fields: Option<Option<HashMap<String, String>>>,
     criteria: Option<Option<Vec<String>>>,
+    stop_words: Option<Option<BTreeSet<String>>>,
 }
 
 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@@ -55,6 +56,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             displayed_fields: None,
             faceted_fields: None,
             criteria: None,
+            stop_words: None,
             update_id,
         }
     }
@@ -91,6 +93,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.criteria = Some(Some(criteria));
     }
 
+    pub fn reset_stop_words(&mut self) {
+        self.stop_words = Some(None);
+    }
+
+    pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
+        self.stop_words = if stop_words.is_empty() {
+            Some(None)
+        } else {
+            Some(Some(stop_words))
+        }
+    }
+
     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
     where
         F: Fn(UpdateIndexingStep, u64) + Sync
@@ -210,6 +224,28 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         Ok(true)
     }
 
+    fn update_stop_words(&mut self) -> anyhow::Result<bool> {
+        match self.stop_words {
+            Some(Some(ref stop_words)) => {
+                let current = self.index.stop_words(self.wtxn)?;
+                // since we can't compare a BTreeSet with an FST we are going to convert the
+                // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
+                let fst = fst::Set::from_iter(&*stop_words)?;
+
+                // Does the new FST differ from the previous one?
+                if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
+                    // we want to re-create our FST.
+                    self.index.put_stop_words(self.wtxn, &fst)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Some(None) => Ok(self.index.delete_stop_words(self.wtxn)?),
+            None => Ok(false),
+        }
+    }
+
     fn update_facets(&mut self) -> anyhow::Result<bool> {
         match self.faceted_fields {
             Some(Some(ref fields)) => {
@@ -248,22 +284,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
 
     pub fn execute<F>(mut self, progress_callback: F) -> anyhow::Result<()>
     where
         F: Fn(UpdateIndexingStep, u64) + Sync
     {
         self.index.set_updated_at(self.wtxn, &Utc::now())?;
         let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
         self.update_displayed()?;
+        let stop_words_updated = self.update_stop_words()?;
         let facets_updated = self.update_facets()?;
         // update_criteria MUST be called after update_facets, since criterion fields must be set
         // as facets.
         self.update_criteria()?;
         let searchable_updated = self.update_searchable()?;
 
-        if facets_updated || searchable_updated {
+        if facets_updated || searchable_updated || stop_words_updated {
             self.reindex(&progress_callback, old_fields_ids_map)?;
         }
 
         Ok(())
     }
 }
 
 #[cfg(test)]
@@ -271,7 +308,7 @@ mod tests {
     use super::*;
 
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::{hashmap, btreeset};
 
     use crate::facet::FacetType;
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -328,7 +365,6 @@ mod tests {
         assert_eq!(result.documents_ids.len(), 1);
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
         assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
-        drop(rtxn);
     }
 
     #[test]
@@ -372,7 +408,6 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids.unwrap(), &["age"][..]);
-        drop(rtxn);
     }
 
     #[test]
@@ -394,7 +429,6 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, None);
-        drop(rtxn);
     }
 
     #[test]
@@ -434,7 +468,6 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, None);
-        drop(rtxn);
     }
 
     #[test]
@@ -478,7 +511,96 @@ mod tests {
         // Only count the field_id 0 and level 0 facet values.
         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
         assert_eq!(count, 4);
-        drop(rtxn);
     }
 
+    #[test]
+    fn default_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure there is no stop_words by default
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+    }
+
+    #[test]
+    fn set_and_reset_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction we provide some stop_words
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        let set = btreeset!{ "i".to_string(), "the".to_string(), "are".to_string() };
+        builder.set_stop_words(set.clone());
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure stop_words are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_some()); // at this point the index should return something
+
+        let stop_words = stop_words.unwrap();
+        let expected = fst::Set::from_iter(&set).unwrap();
+        assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
+
+        // when we search for something that is a non prefix stop_words it should be ignored
+        let result = index.search(&rtxn).query("the ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("i ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("are ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
+
+        // now we'll reset the stop_words and ensure it's None
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_stop_words();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+
+        // now we can search for the stop words
+        let result = index.search(&rtxn).query("the").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+        let result = index.search(&rtxn).query("i").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("are").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // the rest of the search is still not impacted
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
+    }
+
     #[test]
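Note the trailing space in the stop-word queries above ("the ", "i ", "are "): it makes the last word a non-prefix, exact occurrence, so the stop word produces no Query leaf under the guard added in create_query_tree and nothing matches. A prefix or typo-tolerant occurrence of the same word would still be kept by that guard.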
@@ -519,6 +641,5 @@ mod tests {
         assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
         assert!(index.primary_key(&rtxn).unwrap().is_none());
         assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
-        drop(rtxn);
     }
 }