Fix the minimum proximity length

This commit is contained in:
Kerollmops 2020-09-06 09:54:03 +02:00 committed by Clément Renault
parent 1c504471d3
commit 4ca9472e02
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 10 additions and 11 deletions

View File

@ -10,7 +10,6 @@ use std::time::Instant;
use anyhow::Context; use anyhow::Context;
use arc_cache::ArcCache; use arc_cache::ArcCache;
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use cow_utils::CowUtils;
use csv::StringRecord; use csv::StringRecord;
use flate2::read::GzDecoder; use flate2::read::GzDecoder;
use fst::IntoStreamer; use fst::IntoStreamer;

View File

@ -3,14 +3,14 @@ use std::collections::{HashMap, HashSet};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA; use levenshtein_automata::DFA;
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use log::debug; use log::{debug, error};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use roaring::bitmap::{IntoIter, RoaringBitmap}; use roaring::bitmap::{IntoIter, RoaringBitmap};
use near_proximity::near_proximity; use near_proximity::near_proximity;
use crate::query_tokens::{QueryTokens, QueryToken}; use crate::query_tokens::{QueryTokens, QueryToken};
use crate::{Index, DocumentId, Position, Attribute}; use crate::{Index, DocumentId};
// Building these factories is not free. // Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -118,10 +118,8 @@ impl<'a> Search<'a> {
/// Returns the set of docids that contains all of the query words. /// Returns the set of docids that contains all of the query words.
fn compute_candidates( fn compute_candidates(
rtxn: &heed::RoTxn,
index: &Index,
derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)], derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
) -> anyhow::Result<RoaringBitmap> ) -> RoaringBitmap
{ {
// we do a union between all the docids of each of the derived words, // we do a union between all the docids of each of the derived words,
// we got N unions (the number of original query words), we then intersect them. // we got N unions (the number of original query words), we then intersect them.
@ -135,7 +133,7 @@ impl<'a> Search<'a> {
} }
} }
Ok(candidates) candidates
} }
fn fecth_keywords( fn fecth_keywords(
@ -153,8 +151,10 @@ impl<'a> Search<'a> {
for (word, (_distance, docids)) in words { for (word, (_distance, docids)) in words {
if docids.contains(candidate) { if docids.contains(candidate) {
let positions = index.word_docid_positions.get(rtxn, &(word, candidate))?.unwrap(); match index.word_docid_positions.get(rtxn, &(word, candidate))? {
union_positions.union_with(&positions); Some(positions) => union_positions.union_with(&positions),
None => error!("position missing for candidate {} and word {}", candidate, word),
}
} }
} }
keywords.push(union_positions.into_iter()); keywords.push(union_positions.into_iter());
@ -185,13 +185,13 @@ impl<'a> Search<'a> {
} }
let derived_words = Self::fetch_words_docids(rtxn, index, &fst, dfas)?; let derived_words = Self::fetch_words_docids(rtxn, index, &fst, dfas)?;
let candidates = Self::compute_candidates(rtxn, index, &derived_words)?; let candidates = Self::compute_candidates(&derived_words);
debug!("candidates: {:?}", candidates); debug!("candidates: {:?}", candidates);
let mut documents = Vec::new(); let mut documents = Vec::new();
let min_proximity = derived_words.len() as u32; let min_proximity = derived_words.len() as u32 - 1;
let mut number_min_proximity = 0; let mut number_min_proximity = 0;
let mut paths = Vec::new(); let mut paths = Vec::new();