make the attribute ranking rule use the weights and fix the tests

Tamo 2024-05-14 16:56:08 +02:00
parent a0082c4df9
commit caa6a7149a
7 changed files with 306 additions and 36 deletions
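The gist of the change: the `fid` ranking rule now derives its graph edge cost from each field's weight (its rank in `searchableAttributes`, stored in the index's `fieldids_weights_map`) instead of from the raw field id, so attribute ranking follows the user-defined attribute order even when field ids were assigned in a different order. A minimal sketch of that cost computation, assuming a plain `HashMap` in place of the real weights map:

```rust
// Sketch only: a plain HashMap stands in for the index's fieldids_weights_map.
use std::collections::HashMap;

type FieldId = u16;
type Weight = u16;

// Cost of an edge in the `fid` ranking-rule graph for a term subset spanning
// `term_count` terms. Before this commit the cost was `fid * term_count`, which
// tied ranking to internal field ids; using the weight ties it to the
// user-defined `searchableAttributes` order instead.
fn edge_cost(weights: &HashMap<FieldId, Weight>, fid: FieldId, term_count: u32) -> u32 {
    let weight = *weights.get(&fid).expect("every searchable field has a weight");
    u32::from(weight) * term_count
}

fn main() {
    // fid -> weight; the field with id 3 has the highest priority (weight 0).
    let weights = HashMap::from([(3, 0), (1, 1), (2, 2)]);
    assert_eq!(edge_cost(&weights, 3, 2), 0); // best field, cheapest edge
    assert_eq!(edge_cost(&weights, 2, 2), 4);
}
```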

@@ -85,6 +85,9 @@ impl SearchQueue {
                 },
                 search_request = receive_new_searches.recv() => {
+                    if search_request.is_none() {
+                        continue;
+                    }
                     // this unwrap is safe because we're sure the `SearchQueue` still lives somewhere in actix-web
                     let search_request = search_request.unwrap();
                     if searches_running < usize::from(parallelism) && queue.is_empty() {
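The guard added above covers the case where the search channel has been closed: `recv()` then yields `None`, and the loop skips this branch instead of panicking on the `unwrap()` below it. A standalone illustration of that behaviour, assuming a tokio `mpsc` channel like the one `SearchQueue` appears to use:

```rust
// Requires tokio with the "rt-multi-thread", "macros" and "sync" features.
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(8);
    tx.send(1).await.unwrap();
    drop(tx); // every sender dropped -> the channel is closed

    assert_eq!(rx.recv().await, Some(1)); // buffered message still delivered
    assert_eq!(rx.recv().await, None); // closed: skip instead of unwrapping
}
```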

@@ -2722,8 +2722,8 @@ pub(crate) mod tests {
         // We should find tamo first
         insta::assert_debug_snapshot!(results.documents_ids, @r###"
         [
-          0,
           1,
+          0,
         ]
         "###);
     }

@@ -76,7 +76,7 @@ impl<'ctx> SearchContext<'ctx> {
         let mut exact = Vec::new();
         let mut tolerant = Vec::new();
-        for (name, fid, weight) in searchable_fids {
+        for (_name, fid, weight) in searchable_fids {
             if exact_attributes_ids.contains(&fid) {
                 exact.push((fid, weight));
             } else {
@@ -96,22 +96,26 @@ impl<'ctx> SearchContext<'ctx> {
         })
     }
 
-    // TODO: TAMO continue here
     pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> {
-        if attributes_to_search_on.contains(&String::from("*")) {
-            return Ok(());
-        }
-        let fids_map = self.index.fields_ids_map(self.txn)?;
+        let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?;
         let searchable_names = self.index.searchable_fields_and_weights(self.txn)?;
         let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;
 
+        let mut wildcard = false;
+
         let mut restricted_fids = SearchableFids::default();
         for field_name in attributes_to_search_on {
+            if field_name == "*" {
+                wildcard = true;
+                // we cannot early exit as we want to return an error in case of unknown fields
+                continue;
+            }
             let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name);
             let (fid, weight) = match searchable_weight {
                 // The Field id exist and the field is searchable
                 Some((_name, fid, weight)) => (*fid, *weight),
+                // The field is not searchable but the user didn't define any searchable attributes
+                None if user_defined_searchable.is_none() => continue,
                 // The field is not searchable => User error
                 None => {
                     let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
@@ -136,7 +140,16 @@ impl<'ctx> SearchContext<'ctx> {
             };
         }
 
+        if wildcard {
+            let (exact, tolerant) = searchable_names
+                .iter()
+                .map(|(_name, fid, weight)| (*fid, *weight))
+                .partition(|(fid, _weight)| exact_attributes_ids.contains(fid));
+            self.searchable_fids = SearchableFids { tolerant, exact };
+        } else {
             self.searchable_fids = restricted_fids;
+        }
 
         Ok(())
     }
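With this change, `searchable_attributes` no longer returns early when `"*"` is requested: the wildcard is only recorded so that the remaining names are still validated (an unknown field must keep raising a user error), and the full weighted field list is installed at the end. A simplified, self-contained sketch of that control flow, using plain `(name, fid, weight)` tuples and a `String` error instead of the milli types:

```rust
// Simplified sketch: names are resolved against (name, fid, weight) tuples and
// errors are plain Strings; the real code builds a SearchableFids and a UserError.
type FieldId = u16;
type Weight = u16;

fn resolve_searchable(
    attributes_to_search_on: &[String],
    searchable: &[(String, FieldId, Weight)], // everything the index considers searchable
) -> Result<Vec<(FieldId, Weight)>, String> {
    let mut wildcard = false;
    let mut restricted = Vec::new();
    for name in attributes_to_search_on {
        if name == "*" {
            // No early return: the remaining names must still be validated so
            // an unknown field keeps producing an error.
            wildcard = true;
            continue;
        }
        match searchable.iter().find(|(n, _, _)| n == name) {
            Some((_, fid, weight)) => restricted.push((*fid, *weight)),
            None => return Err(format!("attribute `{name}` is not searchable")),
        }
    }
    if wildcard {
        // `*` wins: search every searchable field with its configured weight.
        Ok(searchable.iter().map(|(_, fid, weight)| (*fid, *weight)).collect())
    } else {
        Ok(restricted)
    }
}

fn main() {
    let searchable = vec![(String::from("title"), 1, 0), (String::from("plot"), 3, 1)];
    assert_eq!(resolve_searchable(&[String::from("*")], &searchable), Ok(vec![(1, 0), (3, 1)]));
    assert!(resolve_searchable(&[String::from("*"), String::from("unknown")], &searchable).is_err());
}
```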

@@ -7,12 +7,12 @@ use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id;
 use crate::search::new::SearchContext;
-use crate::Result;
+use crate::{FieldId, Result};
 
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct FidCondition {
     term: LocatedQueryTermSubset,
-    fid: u16,
+    fid: Option<FieldId>,
 }
 
 pub enum FidGraph {}
@@ -26,13 +26,16 @@ impl RankingRuleGraphTrait for FidGraph {
         universe: &RoaringBitmap,
     ) -> Result<ComputedCondition> {
         let FidCondition { term, .. } = condition;
 
-        // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
-        let mut docids = compute_query_term_subset_docids_within_field_id(
-            ctx,
-            &term.term_subset,
-            condition.fid,
-        )?;
-        docids &= universe;
+        let docids = if let Some(fid) = condition.fid {
+            // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
+            let mut docids =
+                compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?;
+            docids &= universe;
+            docids
+        } else {
+            RoaringBitmap::new()
+        };
 
         Ok(ComputedCondition {
             docids,
@@ -68,24 +71,27 @@ impl RankingRuleGraphTrait for FidGraph {
             all_fields.extend(fields);
         }
 
+        let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?;
+
         let mut edges = vec![];
         for fid in all_fields.iter().copied() {
+            let weight = weights_map.weight(fid).unwrap();
             edges.push((
-                fid as u32 * term.term_ids.len() as u32,
-                conditions_interner.insert(FidCondition { term: term.clone(), fid }),
+                weight as u32 * term.term_ids.len() as u32,
+                conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }),
             ));
         }
 
         // always lookup the max_fid if we don't already and add an artificial condition for max scoring
-        let max_fid: Option<u16> = ctx.index.searchable_fields_ids(ctx.txn)?.into_iter().max();
-        if let Some(max_fid) = max_fid {
-            if !all_fields.contains(&max_fid) {
+        let max_weight: Option<u16> = weights_map.max_weight();
+        if let Some(max_weight) = max_weight {
+            if !all_fields.contains(&max_weight) {
                 edges.push((
-                    max_fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
+                    max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
                     conditions_interner.insert(FidCondition {
                         term: term.clone(), // TODO remove this ugly clone
-                        fid: max_fid,
+                        fid: None,
                     }),
                 ));
             }
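Since `FidCondition::fid` is now an `Option<FieldId>`, the artificial condition that keeps the maximum rank reachable carries `None` and resolves to an empty docid set, while real conditions intersect their field's docids with the universe. A rough sketch of that branch, using `roaring::RoaringBitmap` as milli does and a hypothetical `lookup_docids_for_fid` closure standing in for `compute_query_term_subset_docids_within_field_id`:

```rust
// Rough sketch; `lookup_docids_for_fid` is a hypothetical stand-in for
// `compute_query_term_subset_docids_within_field_id`.
use roaring::RoaringBitmap;

type FieldId = u16;

fn condition_docids(
    fid: Option<FieldId>,
    universe: &RoaringBitmap,
    lookup_docids_for_fid: impl Fn(FieldId) -> RoaringBitmap,
) -> RoaringBitmap {
    match fid {
        // A real field: resolve its docids, then clamp them to the universe.
        Some(fid) => {
            let mut docids = lookup_docids_for_fid(fid);
            docids &= universe;
            docids
        }
        // The artificial max-weight condition only exists so the maximum rank
        // is reachable when scoring; it matches no documents.
        None => RoaringBitmap::new(),
    }
}

fn main() {
    let universe = RoaringBitmap::from_iter([1u32, 2, 3]);
    let lookup = |_fid| RoaringBitmap::from_iter([2u32, 4]);
    assert_eq!(condition_docids(Some(0), &universe, &lookup).iter().collect::<Vec<_>>(), vec![2]);
    assert!(condition_docids(None, &universe, &lookup).is_empty());
}
```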

@@ -132,17 +132,17 @@ fn test_attribute_fid_simple() {
 fn test_attribute_fid_ngrams() {
     let index = create_index();
     db_snap!(index, fields_ids_map, @r###"
-    0   title            |
-    1   description      |
-    2   plot             |
-    3   id               |
+    0   id               |
+    1   title            |
+    2   description      |
+    3   plot             |
     "###);
     db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###);
     db_snap!(index, fieldids_weights_map, @r###"
     fid weight
-    0   0   |
-    1   1   |
-    2   2   |
+    1   0   |
+    2   1   |
+    3   2   |
     "###);
 
     let txn = index.read_txn().unwrap();

@@ -0,0 +1,244 @@
---
source: milli/src/search/new/tests/attribute_fid.rs
expression: "format!(\"{document_ids_scores:#?}\")"
---
[
(
2,
[
Fid(
Rank {
rank: 19,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
(
6,
[
Fid(
Rank {
rank: 15,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
5,
[
Fid(
Rank {
rank: 14,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
4,
[
Fid(
Rank {
rank: 13,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
3,
[
Fid(
Rank {
rank: 12,
max_rank: 19,
},
),
Position(
Rank {
rank: 83,
max_rank: 91,
},
),
],
),
(
9,
[
Fid(
Rank {
rank: 11,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
8,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
7,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 73,
max_rank: 91,
},
),
],
),
(
11,
[
Fid(
Rank {
rank: 7,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
10,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
13,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
12,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 78,
max_rank: 91,
},
),
],
),
(
14,
[
Fid(
Rank {
rank: 5,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
0,
[
Fid(
Rank {
rank: 1,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
]

@@ -12,7 +12,6 @@ use time::OffsetDateTime;
 use super::index_documents::{IndexDocumentsConfig, Transform};
 use super::IndexerConfig;
 use crate::criterion::Criterion;
-use crate::documents::FieldIdMapper;
 use crate::error::UserError;
 use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
 use crate::order_by_map::OrderByMap;
@@ -1562,8 +1561,9 @@ mod tests {
         // we must find the appropriate document.
         let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap();
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
+        let fid_map = index.fields_ids_map(&rtxn).unwrap();
         assert_eq!(documents.len(), 1);
-        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
+        assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
         drop(rtxn);
 
         // We change the searchable fields to be the "name" field only.
@@ -1575,12 +1575,16 @@ mod tests {
         // Check that the searchable field have been reset and documents are found now.
         let rtxn = index.read_txn().unwrap();
+        let fid_map = index.fields_ids_map(&rtxn).unwrap();
+        let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap();
+        snapshot!(format!("{user_defined_searchable_fields:?}"), @"None");
+        // the searchable fields should contain all the fields
         let searchable_fields = index.searchable_fields(&rtxn).unwrap();
-        snapshot!(format!("{searchable_fields:?}"), @r###"["name", "id", "age"]"###);
+        snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###);
         let result = index.search(&rtxn).query("23").execute().unwrap();
         assert_eq!(result.documents_ids.len(), 1);
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
-        assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
+        assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..]));
     }
 
     #[test]