mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Merge pull request #594 from meilisearch/fix-stop-words
Fixes the stop words and words fst generation
This commit is contained in:
commit
6db6b40659
@ -6,4 +6,5 @@
|
|||||||
- Add support for nested null, boolean and seq values (#571 and #568, #574)
|
- Add support for nested null, boolean and seq values (#571 and #568, #574)
|
||||||
- Fixed the core benchmark (#576)
|
- Fixed the core benchmark (#576)
|
||||||
- Publish ARMv7 and ARMv8 binaries on releases (#540 and #581)
|
- Publish ARMv7 and ARMv8 binaries on releases (#540 and #581)
|
||||||
- Fixing a bug where the result of the update status after the first update was empty (#542)
|
- Fixed a bug where the result of the update status after the first update was empty (#542)
|
||||||
|
- Fixed a bug where stop words were not handled correctly (#594)
|
||||||
|
@ -135,52 +135,61 @@ pub fn apply_stop_words_update(
|
|||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
index: &store::Index,
|
index: &store::Index,
|
||||||
stop_words: BTreeSet<String>,
|
stop_words: BTreeSet<String>,
|
||||||
) -> MResult<bool> {
|
) -> MResult<bool>
|
||||||
|
{
|
||||||
|
let mut must_reindex = false;
|
||||||
|
|
||||||
let old_stop_words: BTreeSet<String> = index.main
|
let old_stop_words: BTreeSet<String> = index.main
|
||||||
.stop_words_fst(writer)?
|
.stop_words_fst(writer)?
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
.stream()
|
.stream()
|
||||||
.into_strs().unwrap().into_iter().collect();
|
.into_strs()?
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
|
|
||||||
let deletion: BTreeSet<String> = old_stop_words.difference(&stop_words).cloned().collect();
|
let deletion: BTreeSet<String> = old_stop_words.difference(&stop_words).cloned().collect();
|
||||||
let addition: BTreeSet<String> = stop_words.difference(&old_stop_words).cloned().collect();
|
let addition: BTreeSet<String> = stop_words.difference(&old_stop_words).cloned().collect();
|
||||||
|
|
||||||
if !addition.is_empty() {
|
if !addition.is_empty() {
|
||||||
apply_stop_words_addition(
|
apply_stop_words_addition(writer, index, addition)?;
|
||||||
writer,
|
|
||||||
index,
|
|
||||||
addition
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if !deletion.is_empty() {
|
if !deletion.is_empty() {
|
||||||
apply_stop_words_deletion(
|
must_reindex = true;
|
||||||
writer,
|
apply_stop_words_deletion(writer, index, deletion)?;
|
||||||
index,
|
|
||||||
deletion
|
|
||||||
)?;
|
|
||||||
return Ok(true)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let stop_words_fst = fst::Set::from_iter(stop_words)?;
|
if let Some(words_fst) = index.main.words_fst(writer)? {
|
||||||
index.main.put_words_fst(writer, &stop_words_fst)?;
|
let stop_words = fst::Set::from_iter(stop_words)?;
|
||||||
Ok(false)
|
let op = OpBuilder::new()
|
||||||
|
.add(&words_fst)
|
||||||
|
.add(&stop_words)
|
||||||
|
.difference();
|
||||||
|
|
||||||
|
let mut builder = fst::SetBuilder::memory();
|
||||||
|
builder.extend_stream(op)?;
|
||||||
|
let words_fst = builder.into_inner().and_then(fst::Set::from_bytes)?;
|
||||||
|
|
||||||
|
index.main.put_words_fst(writer, &words_fst)?;
|
||||||
|
index.main.put_stop_words_fst(writer, &stop_words)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(must_reindex)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn apply_stop_words_addition(
|
fn apply_stop_words_addition(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
index: &store::Index,
|
index: &store::Index,
|
||||||
addition: BTreeSet<String>,
|
addition: BTreeSet<String>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()>
|
||||||
|
{
|
||||||
let main_store = index.main;
|
let main_store = index.main;
|
||||||
let postings_lists_store = index.postings_lists;
|
let postings_lists_store = index.postings_lists;
|
||||||
|
|
||||||
let mut stop_words_builder = SetBuilder::memory();
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
|
|
||||||
for word in addition {
|
for word in addition {
|
||||||
stop_words_builder.insert(&word).unwrap();
|
stop_words_builder.insert(&word)?;
|
||||||
// we remove every posting list associated to a new stop word
|
// we remove every posting list associated to a new stop word
|
||||||
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
|
postings_lists_store.del_postings_list(writer, word.as_bytes())?;
|
||||||
}
|
}
|
||||||
@ -188,8 +197,7 @@ fn apply_stop_words_addition(
|
|||||||
// create the new delta stop words fst
|
// create the new delta stop words fst
|
||||||
let delta_stop_words = stop_words_builder
|
let delta_stop_words = stop_words_builder
|
||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)?;
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
// we also need to remove all the stop words from the main fst
|
// we also need to remove all the stop words from the main fst
|
||||||
if let Some(word_fst) = main_store.words_fst(writer)? {
|
if let Some(word_fst) = main_store.words_fst(writer)? {
|
||||||
@ -199,11 +207,10 @@ fn apply_stop_words_addition(
|
|||||||
.difference();
|
.difference();
|
||||||
|
|
||||||
let mut word_fst_builder = SetBuilder::memory();
|
let mut word_fst_builder = SetBuilder::memory();
|
||||||
word_fst_builder.extend_stream(op).unwrap();
|
word_fst_builder.extend_stream(op)?;
|
||||||
let word_fst = word_fst_builder
|
let word_fst = word_fst_builder
|
||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)?;
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
main_store.put_words_fst(writer, &word_fst)?;
|
main_store.put_words_fst(writer, &word_fst)?;
|
||||||
}
|
}
|
||||||
@ -217,11 +224,10 @@ fn apply_stop_words_addition(
|
|||||||
.r#union();
|
.r#union();
|
||||||
|
|
||||||
let mut stop_words_builder = SetBuilder::memory();
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
stop_words_builder.extend_stream(op).unwrap();
|
stop_words_builder.extend_stream(op)?;
|
||||||
let stop_words_fst = stop_words_builder
|
let stop_words_fst = stop_words_builder
|
||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)?;
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||||
|
|
||||||
@ -237,14 +243,13 @@ fn apply_stop_words_deletion(
|
|||||||
let mut stop_words_builder = SetBuilder::memory();
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
|
|
||||||
for word in deletion {
|
for word in deletion {
|
||||||
stop_words_builder.insert(&word).unwrap();
|
stop_words_builder.insert(&word)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// create the new delta stop words fst
|
// create the new delta stop words fst
|
||||||
let delta_stop_words = stop_words_builder
|
let delta_stop_words = stop_words_builder
|
||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)?;
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
// now we delete all of these stop words from the main store
|
// now we delete all of these stop words from the main store
|
||||||
let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
|
let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
|
||||||
@ -255,11 +260,8 @@ fn apply_stop_words_deletion(
|
|||||||
.difference();
|
.difference();
|
||||||
|
|
||||||
let mut stop_words_builder = SetBuilder::memory();
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
stop_words_builder.extend_stream(op).unwrap();
|
stop_words_builder.extend_stream(op)?;
|
||||||
let stop_words_fst = stop_words_builder
|
let stop_words_fst = stop_words_builder.into_inner().and_then(fst::Set::from_bytes)?;
|
||||||
.into_inner()
|
|
||||||
.and_then(fst::Set::from_bytes)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?)
|
Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?)
|
||||||
}
|
}
|
||||||
@ -276,14 +278,14 @@ pub fn apply_synonyms_update(
|
|||||||
let mut synonyms_builder = SetBuilder::memory();
|
let mut synonyms_builder = SetBuilder::memory();
|
||||||
synonyms_store.clear(writer)?;
|
synonyms_store.clear(writer)?;
|
||||||
for (word, alternatives) in synonyms.clone() {
|
for (word, alternatives) in synonyms.clone() {
|
||||||
synonyms_builder.insert(&word).unwrap();
|
synonyms_builder.insert(&word)?;
|
||||||
|
|
||||||
let alternatives = {
|
let alternatives = {
|
||||||
let alternatives = SetBuf::from_dirty(alternatives);
|
let alternatives = SetBuf::from_dirty(alternatives);
|
||||||
let mut alternatives_builder = SetBuilder::memory();
|
let mut alternatives_builder = SetBuilder::memory();
|
||||||
alternatives_builder.extend_iter(alternatives).unwrap();
|
alternatives_builder.extend_iter(alternatives)?;
|
||||||
let bytes = alternatives_builder.into_inner().unwrap();
|
let bytes = alternatives_builder.into_inner()?;
|
||||||
fst::Set::from_bytes(bytes).unwrap()
|
fst::Set::from_bytes(bytes)?
|
||||||
};
|
};
|
||||||
|
|
||||||
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
|
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
|
||||||
@ -291,8 +293,7 @@ pub fn apply_synonyms_update(
|
|||||||
|
|
||||||
let synonyms_set = synonyms_builder
|
let synonyms_set = synonyms_builder
|
||||||
.into_inner()
|
.into_inner()
|
||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)?;
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
main_store.put_synonyms_fst(writer, &synonyms_set)?;
|
main_store.put_synonyms_fst(writer, &synonyms_set)?;
|
||||||
|
|
||||||
|
@ -6,11 +6,7 @@ mod common;
|
|||||||
#[test]
|
#[test]
|
||||||
fn update_stop_words() {
|
fn update_stop_words() {
|
||||||
let mut server = common::Server::with_uid("movies");
|
let mut server = common::Server::with_uid("movies");
|
||||||
let body = json!({
|
server.populate_movies();
|
||||||
"uid": "movies",
|
|
||||||
"primaryKey": "id",
|
|
||||||
});
|
|
||||||
server.create_index(body);
|
|
||||||
|
|
||||||
// 1 - Get stop words
|
// 1 - Get stop words
|
||||||
|
|
||||||
@ -36,3 +32,32 @@ fn update_stop_words() {
|
|||||||
let (response, _status_code) = server.get_stop_words();
|
let (response, _status_code) = server.get_stop_words();
|
||||||
assert_eq!(response.as_array().unwrap().is_empty(), true);
|
assert_eq!(response.as_array().unwrap().is_empty(), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn add_documents_and_stop_words() {
|
||||||
|
let mut server = common::Server::with_uid("movies");
|
||||||
|
server.populate_movies();
|
||||||
|
|
||||||
|
// 2 - Update stop words
|
||||||
|
|
||||||
|
let body = json!(["the", "of"]);
|
||||||
|
server.update_stop_words(body.clone());
|
||||||
|
|
||||||
|
// 3 - Search for a document with stop words
|
||||||
|
|
||||||
|
let (response, _status_code) = server.search("q=the%20mask");
|
||||||
|
assert!(!response["hits"].as_array().unwrap().is_empty());
|
||||||
|
|
||||||
|
// 4 - Search for documents with *only* stop words
|
||||||
|
|
||||||
|
let (response, _status_code) = server.search("q=the%20of");
|
||||||
|
assert!(response["hits"].as_array().unwrap().is_empty());
|
||||||
|
|
||||||
|
// 5 - Delete all stop words
|
||||||
|
|
||||||
|
// server.delete_stop_words();
|
||||||
|
|
||||||
|
// // 6 - Search for a document with one stop word
|
||||||
|
|
||||||
|
// assert!(!response["hits"].as_array().unwrap().is_empty());
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user