3824: Changes the way words are counted in the word count DB r=ManyTheFish a=dureuill

# Pull Request

## Related issue

Fixes https://github.com/meilisearch/meilisearch/issues/3823

## What does this PR do?

- When parsing a query, apply a position offset for hard separators that is consistent with the one applied at indexing time

### DB breaking changes

- Count the actual number of words in `field_id_word_count_docids`, instead of deriving the count from the highest word position
- Raise the word-count limit for storing an entry in the DB from 10 to 30

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2023-06-08 13:26:05 +00:00 committed by GitHub
commit 047d22fcb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 6 deletions

View File

@ -79,7 +79,7 @@ pub fn located_query_terms_from_tokens(
TokenKind::Separator(separator_kind) => { TokenKind::Separator(separator_kind) => {
// add penalty for hard separators // add penalty for hard separators
if let SeparatorKind::Hard = separator_kind { if let SeparatorKind::Hard = separator_kind {
position = position.wrapping_add(1); position = position.wrapping_add(7);
} }
phrase = 'phrase: { phrase = 'phrase: {

View File

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use std::{cmp, io}; use std::io;
use grenad::Sorter; use grenad::Sorter;
@ -54,11 +54,10 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
} }
for position in read_u32_ne_bytes(value) { for position in read_u32_ne_bytes(value) {
let (field_id, position) = relative_from_absolute_position(position); let (field_id, _) = relative_from_absolute_position(position);
let word_count = position as u32 + 1;
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
*value = cmp::max(*value, word_count); *value += 1;
} }
} }
@ -83,7 +82,7 @@ fn drain_document_fid_wordcount_into_sorter(
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
for (fid, count) in document_fid_wordcount.drain() { for (fid, count) in document_fid_wordcount.drain() {
if count <= 10 { if count <= 30 {
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(&fid.to_be_bytes()); key_buffer.extend_from_slice(&fid.to_be_bytes());
key_buffer.push(count as u8); key_buffer.push(count as u8);