From c15c076da95a87db9fb62caff4308427cdfe4824 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 8 Jun 2023 11:30:35 +0200 Subject: [PATCH 1/3] DB BREAKING: Count the number of words in field_id_word_count_docids --- .../extract/extract_fid_word_count_docids.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 315ebdf0c..6952eb484 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::fs::File; -use std::{cmp, io}; +use std::io; use grenad::Sorter; @@ -54,11 +54,10 @@ pub fn extract_fid_word_count_docids( } for position in read_u32_ne_bytes(value) { - let (field_id, position) = relative_from_absolute_position(position); - let word_count = position as u32 + 1; + let (field_id, _) = relative_from_absolute_position(position); let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); - *value = cmp::max(*value, word_count); + *value += 1; } } From 9f37b61666363bf87175ad8722b1cd3929fd0160 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 8 Jun 2023 11:31:38 +0200 Subject: [PATCH 2/3] DB BREAKING: raise limit of word count from 10 to 30. --- .../index_documents/extract/extract_fid_word_count_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 6952eb484..fe8eb93ed 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -82,7 +82,7 @@ fn drain_document_fid_wordcount_into_sorter( let mut key_buffer = Vec::new(); for (fid, count) in document_fid_wordcount.drain() { - if count <= 10 { + if count <= 30 { key_buffer.clear(); key_buffer.extend_from_slice(&fid.to_be_bytes()); key_buffer.push(count as u8); From a2a3b8c9739e6d04dc8e4fbd4f31afff4163fd4b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 6 Jun 2023 15:08:13 +0200 Subject: [PATCH 3/3] Fix offset difference between query and indexing for hard separators --- milli/src/search/new/query_term/parse_query.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 69c2cd9c9..6f146b208 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -79,7 +79,7 @@ pub fn located_query_terms_from_tokens( TokenKind::Separator(separator_kind) => { // add penalty for hard separators if let SeparatorKind::Hard = separator_kind { - position = position.wrapping_add(1); + position = position.wrapping_add(7); } phrase = 'phrase: {