Merge pull request #594 from meilisearch/fix-stop-words

Fixes the stop words and words fst generation
This commit is contained in:
Clément Renault 2020-04-07 11:06:39 +02:00 committed by GitHub
commit 6db6b40659
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 74 additions and 47 deletions

View File

@ -6,4 +6,5 @@
- Add support of nested null, boolean and seq values (#571 and #568, #574) - Add support of nested null, boolean and seq values (#571 and #568, #574)
- Fixed the core benchmark (#576) - Fixed the core benchmark (#576)
- Publish an ARMv7 and ARMv8 binaries on releases (#540 and #581) - Publish an ARMv7 and ARMv8 binaries on releases (#540 and #581)
- Fixing a bug where the result of the update status after the first update was empty (#542) - Fixed a bug where the result of the update status after the first update was empty (#542)
- Fixed a bug where stop words were not handled correctly (#594)

View File

@ -135,52 +135,61 @@ pub fn apply_stop_words_update(
writer: &mut heed::RwTxn<MainT>, writer: &mut heed::RwTxn<MainT>,
index: &store::Index, index: &store::Index,
stop_words: BTreeSet<String>, stop_words: BTreeSet<String>,
) -> MResult<bool> { ) -> MResult<bool>
{
let mut must_reindex = false;
let old_stop_words: BTreeSet<String> = index.main let old_stop_words: BTreeSet<String> = index.main
.stop_words_fst(writer)? .stop_words_fst(writer)?
.unwrap_or_default() .unwrap_or_default()
.stream() .stream()
.into_strs().unwrap().into_iter().collect(); .into_strs()?
.into_iter()
.collect();
let deletion: BTreeSet<String> = old_stop_words.difference(&stop_words).cloned().collect(); let deletion: BTreeSet<String> = old_stop_words.difference(&stop_words).cloned().collect();
let addition: BTreeSet<String> = stop_words.difference(&old_stop_words).cloned().collect(); let addition: BTreeSet<String> = stop_words.difference(&old_stop_words).cloned().collect();
if !addition.is_empty() { if !addition.is_empty() {
apply_stop_words_addition( apply_stop_words_addition(writer, index, addition)?;
writer,
index,
addition
)?;
} }
if !deletion.is_empty() { if !deletion.is_empty() {
apply_stop_words_deletion( must_reindex = true;
writer, apply_stop_words_deletion(writer, index, deletion)?;
index,
deletion
)?;
return Ok(true)
} }
let stop_words_fst = fst::Set::from_iter(stop_words)?; if let Some(words_fst) = index.main.words_fst(writer)? {
index.main.put_words_fst(writer, &stop_words_fst)?; let stop_words = fst::Set::from_iter(stop_words)?;
Ok(false) let op = OpBuilder::new()
.add(&words_fst)
.add(&stop_words)
.difference();
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(op)?;
let words_fst = builder.into_inner().and_then(fst::Set::from_bytes)?;
index.main.put_words_fst(writer, &words_fst)?;
index.main.put_stop_words_fst(writer, &stop_words)?;
}
Ok(must_reindex)
} }
fn apply_stop_words_addition( fn apply_stop_words_addition(
writer: &mut heed::RwTxn<MainT>, writer: &mut heed::RwTxn<MainT>,
index: &store::Index, index: &store::Index,
addition: BTreeSet<String>, addition: BTreeSet<String>,
) -> MResult<()> { ) -> MResult<()>
{
let main_store = index.main; let main_store = index.main;
let postings_lists_store = index.postings_lists; let postings_lists_store = index.postings_lists;
let mut stop_words_builder = SetBuilder::memory(); let mut stop_words_builder = SetBuilder::memory();
for word in addition { for word in addition {
stop_words_builder.insert(&word).unwrap(); stop_words_builder.insert(&word)?;
// we remove every posting list associated to a new stop word // we remove every posting list associated to a new stop word
postings_lists_store.del_postings_list(writer, word.as_bytes())?; postings_lists_store.del_postings_list(writer, word.as_bytes())?;
} }
@ -188,8 +197,7 @@ fn apply_stop_words_addition(
// create the new delta stop words fst // create the new delta stop words fst
let delta_stop_words = stop_words_builder let delta_stop_words = stop_words_builder
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)?;
.unwrap();
// we also need to remove all the stop words from the main fst // we also need to remove all the stop words from the main fst
if let Some(word_fst) = main_store.words_fst(writer)? { if let Some(word_fst) = main_store.words_fst(writer)? {
@ -199,11 +207,10 @@ fn apply_stop_words_addition(
.difference(); .difference();
let mut word_fst_builder = SetBuilder::memory(); let mut word_fst_builder = SetBuilder::memory();
word_fst_builder.extend_stream(op).unwrap(); word_fst_builder.extend_stream(op)?;
let word_fst = word_fst_builder let word_fst = word_fst_builder
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)?;
.unwrap();
main_store.put_words_fst(writer, &word_fst)?; main_store.put_words_fst(writer, &word_fst)?;
} }
@ -217,11 +224,10 @@ fn apply_stop_words_addition(
.r#union(); .r#union();
let mut stop_words_builder = SetBuilder::memory(); let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap(); stop_words_builder.extend_stream(op)?;
let stop_words_fst = stop_words_builder let stop_words_fst = stop_words_builder
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)?;
.unwrap();
main_store.put_stop_words_fst(writer, &stop_words_fst)?; main_store.put_stop_words_fst(writer, &stop_words_fst)?;
@ -237,14 +243,13 @@ fn apply_stop_words_deletion(
let mut stop_words_builder = SetBuilder::memory(); let mut stop_words_builder = SetBuilder::memory();
for word in deletion { for word in deletion {
stop_words_builder.insert(&word).unwrap(); stop_words_builder.insert(&word)?;
} }
// create the new delta stop words fst // create the new delta stop words fst
let delta_stop_words = stop_words_builder let delta_stop_words = stop_words_builder
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)?;
.unwrap();
// now we delete all of these stop words from the main store // now we delete all of these stop words from the main store
let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default(); let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
@ -255,11 +260,8 @@ fn apply_stop_words_deletion(
.difference(); .difference();
let mut stop_words_builder = SetBuilder::memory(); let mut stop_words_builder = SetBuilder::memory();
stop_words_builder.extend_stream(op).unwrap(); stop_words_builder.extend_stream(op)?;
let stop_words_fst = stop_words_builder let stop_words_fst = stop_words_builder.into_inner().and_then(fst::Set::from_bytes)?;
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?) Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?)
} }
@ -276,14 +278,14 @@ pub fn apply_synonyms_update(
let mut synonyms_builder = SetBuilder::memory(); let mut synonyms_builder = SetBuilder::memory();
synonyms_store.clear(writer)?; synonyms_store.clear(writer)?;
for (word, alternatives) in synonyms.clone() { for (word, alternatives) in synonyms.clone() {
synonyms_builder.insert(&word).unwrap(); synonyms_builder.insert(&word)?;
let alternatives = { let alternatives = {
let alternatives = SetBuf::from_dirty(alternatives); let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory(); let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives).unwrap(); alternatives_builder.extend_iter(alternatives)?;
let bytes = alternatives_builder.into_inner().unwrap(); let bytes = alternatives_builder.into_inner()?;
fst::Set::from_bytes(bytes).unwrap() fst::Set::from_bytes(bytes)?
}; };
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?; synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
@ -291,8 +293,7 @@ pub fn apply_synonyms_update(
let synonyms_set = synonyms_builder let synonyms_set = synonyms_builder
.into_inner() .into_inner()
.and_then(fst::Set::from_bytes) .and_then(fst::Set::from_bytes)?;
.unwrap();
main_store.put_synonyms_fst(writer, &synonyms_set)?; main_store.put_synonyms_fst(writer, &synonyms_set)?;

View File

@ -6,11 +6,7 @@ mod common;
#[test] #[test]
fn update_stop_words() { fn update_stop_words() {
let mut server = common::Server::with_uid("movies"); let mut server = common::Server::with_uid("movies");
let body = json!({ server.populate_movies();
"uid": "movies",
"primaryKey": "id",
});
server.create_index(body);
// 1 - Get stop words // 1 - Get stop words
@ -36,3 +32,32 @@ fn update_stop_words() {
let (response, _status_code) = server.get_stop_words(); let (response, _status_code) = server.get_stop_words();
assert_eq!(response.as_array().unwrap().is_empty(), true); assert_eq!(response.as_array().unwrap().is_empty(), true);
} }
/// Checks how search behaves once stop words are set on a populated index.
///
/// After `populate_movies()` and registering "the" and "of" as stop words:
/// - a query mixing a stop word with another word must still return hits;
/// - a query made only of stop words must return no hits.
#[test]
fn add_documents_and_stop_words() {
    let mut server = common::Server::with_uid("movies");
    server.populate_movies();

    // 1 - Update stop words
    let stop_words = json!(["the", "of"]);
    server.update_stop_words(stop_words);

    // 2 - Search for a document with a stop word plus a regular word
    let (response, _status_code) = server.search("q=the%20mask");
    assert!(!response["hits"].as_array().unwrap().is_empty());

    // 3 - Search for documents with *only* stop words
    let (response, _status_code) = server.search("q=the%20of");
    assert!(response["hits"].as_array().unwrap().is_empty());

    // TODO: cover deletion of all stop words, then run a *new* search
    // containing a former stop word and assert that it returns hits.
    // NOTE(review): the earlier commented-out draft of this step asserted on
    // the previous `response` without searching again; issue a fresh
    // `server.search(..)` when enabling it.
}