Remove the raw_query functions

2024-11-23 10:37:41 +08:00 · 2019-12-11 15:34:30 +01:00 · 2019-12-11 15:34:30 +01:00 · ea148575cf
commit ea148575cf
parent efc2be0b7b
2 changed files with 6 additions and 349 deletions
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@ -39,7 +39,6 @@ pub fn bucket_sort<'c>(
    synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
 {
    // let automatons = construct_automatons(query);
    let (automatons, query_enhancer) =
        construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
@ -286,14 +285,11 @@ impl<'txn> PostingsListView<'txn> {
    }
    pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
-        *self = match self {
+        let input = match self {
-            PostingsListView::Original { input, .. } => {
+            PostingsListView::Original { input, .. } => input.clone(),
-                PostingsListView::Rewritten { input: input.clone(), postings_list }
+            PostingsListView::Rewritten { input, .. } => input.clone(),
            },
            PostingsListView::Rewritten { input, .. } => {
                PostingsListView::Rewritten { input: input.clone(), postings_list }
            },
        };
        *self = PostingsListView::rewritten(input, postings_list);
    }
    pub fn len(&self) -> usize {
@ -565,7 +561,8 @@ fn construct_automatons2(
                }
            }
-            if true && n == 1 {
+            if n == 1 {
                // automatons for splitted words
                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
                    let mut left_automaton = QueryWordAutomaton::exact(left);
                    left_automaton.phrase_query = Some((0, 2));
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@ -399,346 +399,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
    }
 }
 /// Runs `query` and returns the documents located in `range` once they
 /// have been ordered by `criteria`.
 ///
 /// When a `filter` is given, the whole work is delegated to
 /// `raw_query_with_distinct` with a distinct function that always returns
 /// `None` and a distinct size of `1`.
 ///
 /// Otherwise the automatons groups yielded by the `AutomatonProducer` are
 /// accumulated one group at a time. After each new group the raw documents
 /// are fetched again for all the accumulated groups, bucket sorted by the
 /// criteria, and the `range` slice of the sorted documents replaces the
 /// previously saved result. The loop stops when the producer is exhausted
 /// or when `timeout` has elapsed.
 ///
 /// # Errors
 ///
 /// Propagates the errors returned by `AutomatonProducer::new`,
 /// `fetch_raw_documents` and by the iteration over
 /// `documents_fields_counts_store.document_fields_counts`.
 ///
 /// # Panics
 ///
 /// Panics if `SetBuf::new` rejects the collected fields counts
 /// (presumably when they are not sorted and deduplicated; TODO confirm).
 fn raw_query<'c, FI>(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    range: Range<usize>,
    filter: Option<FI>,
    timeout: Option<Duration>,
    criteria: Criteria<'c>,
    searchable_attrs: Option<ReorderedAttrs>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
 where
    FI: Fn(DocumentId) -> bool,
 {
    // We delegate the filter work to the distinct query builder,
    // specifying a distinct rule that has no effect.
    if filter.is_some() {
        let distinct = |_| None;
        let distinct_size = 1;
        return raw_query_with_distinct(
            reader,
            query,
            range,
            filter,
            distinct,
            distinct_size,
            timeout,
            criteria,
            searchable_attrs,
            main_store,
            postings_lists_store,
            documents_fields_counts_store,
            synonyms_store,
        );
    }
    // reference point used by both timeout checks below
    let start_processing = Instant::now();
    // holds the result of the last fully processed iteration,
    // at most `range.len()` documents are stored in it
    let mut raw_documents_processed = Vec::with_capacity(range.len());
    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
        reader,
        query,
        main_store,
        postings_lists_store,
        synonyms_store,
    )?;
    let automaton_producer = automaton_producer.into_iter();
    let mut automatons = Vec::new();
    // aggregate the automatons groups, one more group at each iteration
    for auts in automaton_producer {
        automatons.push(auts);
        for (i, group) in automatons.iter().enumerate() {
            debug!("group {} automatons {:?}", i, group.automatons);
        }
        let before_fetch_raw_documents = Instant::now();
        // we must retrieve the documents associated
        // with the current automatons
        let mut raw_documents = fetch_raw_documents(
            reader,
            &automatons,
            &query_enhancer,
            searchable_attrs.as_ref(),
            main_store,
            postings_lists_store,
        )?;
        debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
        // stop processing when time is running out, but only if a previous
        // iteration already saved some documents: the result of that
        // iteration is returned and the documents just fetched are dropped
        if let Some(timeout) = timeout {
            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
                break;
            }
        }
        let before_bucket_sort = Instant::now();
        // start with one group containing every document, each criterion
        // then splits the groups it sorts into smaller groups
        let mut groups = vec![raw_documents.as_mut_slice()];
        'criteria: for criterion in criteria.as_ref() {
            // take the groups produced by the previous criterion and
            // rebuild `groups` from scratch for this one
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            let mut documents_seen = 0;
            for group in tmp_groups {
                // if this group does not overlap with the requested range,
                // push it without sorting and splitting it
                if documents_seen + group.len() < range.start {
                    documents_seen += group.len();
                    groups.push(group);
                    continue;
                }
                // we must pull the fields counts of these documents
                // TODO it would be great to have a "dependency" thing for each criterion
                //      and make it so that we can be lazy on pulling/computing some data.
                if criterion.name() == "Exact" {
                    for document in group.iter_mut() {
                        let mut fields_counts = Vec::new();
                        for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
                            let (attr, count) = result?;
                            fields_counts.push(AttrCount { attr: attr.0, count });
                        }
                        document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
                    }
                }
                // order the group with this criterion, then split it into
                // sub-groups of documents that the criterion considers equal
                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    debug!("criterion {} produced a group of size {}", criterion.name(), group.len());
                    documents_seen += group.len();
                    groups.push(group);
                    // we have sorted enough documents if the last document sorted is after
                    // the end of the requested range, we can continue to the next criterion;
                    // the groups not visited yet are not pushed back into `groups`
                    if documents_seen >= range.end {
                        continue 'criteria;
                    }
                }
            }
        }
        debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed());
        // once we classified the documents related to the current
        // automatons we save that as the next valid result
        let iter = raw_documents
            .into_iter()
            .skip(range.start)
            .take(range.len());
        raw_documents_processed.clear();
        raw_documents_processed.extend(iter);
        // stop processing when time is running out
        if let Some(timeout) = timeout {
            if start_processing.elapsed() > timeout {
                break;
            }
        }
    }
    // make real documents now that we know
    // those must be returned
    let documents = raw_documents_processed
        .into_iter()
        .map(Document::from_raw)
        .collect();
    Ok(documents)
 }
 /// Runs `query` and returns the documents located in `range` once they
 /// have been ordered by `criteria`, keeping only the documents accepted by
 /// `filter` (when one is given) and by the distinct rule.
 ///
 /// `distinct` gives the optional distinct key of a document and
 /// `distinct_size` is handed to `DistinctMap::new` (presumably the number
 /// of documents allowed to share the same key; TODO confirm). The bounds of
 /// `range` are compared to the number of documents registered in the
 /// distinct map, not to raw positions in the documents list.
 ///
 /// The automatons groups yielded by the `AutomatonProducer` are accumulated
 /// one group at a time. After each new group the raw documents are fetched
 /// again for all the accumulated groups, bucket sorted, and the accepted
 /// documents replace the previously saved result. The loop stops when the
 /// producer is exhausted or when `timeout` has elapsed.
 ///
 /// # Errors
 ///
 /// Propagates the errors returned by `AutomatonProducer::new` and
 /// `fetch_raw_documents`.
 ///
 /// # Panics
 ///
 /// The final pass calls `unwrap()` on the entries removed from the filter
 /// and distinct key caches, it panics if it reaches a document for which
 /// no entry exists.
 fn raw_query_with_distinct<'c, FI, FD>(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    range: Range<usize>,
    filter: Option<FI>,
    distinct: FD,
    distinct_size: usize,
    timeout: Option<Duration>,
    criteria: Criteria<'c>,
    searchable_attrs: Option<ReorderedAttrs>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
 where
    FI: Fn(DocumentId) -> bool,
    FD: Fn(DocumentId) -> Option<u64>,
 {
    // reference point used by both timeout checks below
    let start_processing = Instant::now();
    // holds the result of the last fully processed iteration
    let mut raw_documents_processed = Vec::new();
    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
        reader,
        query,
        main_store,
        postings_lists_store,
        synonyms_store,
    )?;
    let automaton_producer = automaton_producer.into_iter();
    let mut automatons = Vec::new();
    // aggregate the automatons groups, one more group at each iteration
    for auts in automaton_producer {
        automatons.push(auts);
        // we must retrieve the documents associated
        // with the current automatons
        let mut raw_documents = fetch_raw_documents(
            reader,
            &automatons,
            &query_enhancer,
            searchable_attrs.as_ref(),
            main_store,
            postings_lists_store,
        )?;
        // stop processing when time is running out, but only if a previous
        // iteration already saved some documents: the result of that
        // iteration is returned and the documents just fetched are dropped
        if let Some(timeout) = timeout {
            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
                break;
            }
        }
        // start with one group containing every document, each criterion
        // then splits the groups it sorts into smaller groups
        let mut groups = vec![raw_documents.as_mut_slice()];
        // per document id caches: the distinct key and the filter verdict
        // are computed at most once and reused by the final pass below
        let mut key_cache = HashMap::new();
        let mut filter_map = HashMap::new();
        // these two variables inform on the current distinct map and
        // on the raw offset of the start of the group where the
        // range.start bound is located according to the distinct function
        let mut distinct_map = DistinctMap::new(distinct_size);
        let mut distinct_raw_offset = 0;
        'criteria: for criterion in criteria.as_ref() {
            // take the groups produced by the previous criterion and
            // rebuild `groups` from scratch for this one
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            // registrations go through a buffered view of the distinct map
            // that is recreated for every criterion
            let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
            let mut documents_seen = 0;
            for group in tmp_groups {
                // if this group does not overlap with the requested range,
                // push it without sorting and splitting it
                if documents_seen + group.len() < distinct_raw_offset {
                    documents_seen += group.len();
                    groups.push(group);
                    continue;
                }
                // order the group with this criterion, then split it into
                // sub-groups of documents that the criterion considers equal
                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    // we must compute the real distinguished len of this sub-group
                    for document in group.iter() {
                        let filter_accepted = match &filter {
                            Some(filter) => {
                                let entry = filter_map.entry(document.id);
                                *entry.or_insert_with(|| (filter)(document.id))
                            }
                            None => true,
                        };
                        // only the documents accepted by the filter are
                        // registered in the distinct map
                        if filter_accepted {
                            let entry = key_cache.entry(document.id);
                            let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
                            match key.clone() {
                                Some(key) => buf_distinct.register(key),
                                None => buf_distinct.register_without_key(),
                            };
                        }
                        // the requested range end is reached: stop computing distinct
                        if buf_distinct.len() >= range.end {
                            break;
                        }
                    }
                    documents_seen += group.len();
                    groups.push(group);
                    // if this sub-group does not overlap with the requested range
                    // we must update the distinct map and its start index
                    // (presumably `transfert_to_internal` commits the buffered
                    // registrations into `distinct_map`; TODO confirm)
                    if buf_distinct.len() < range.start {
                        buf_distinct.transfert_to_internal();
                        distinct_raw_offset = documents_seen;
                    }
                    // we have sorted enough documents if the last document sorted is after
                    // the end of the requested range, we can continue to the next criterion;
                    // the groups not visited yet are not pushed back into `groups`
                    if buf_distinct.len() >= range.end {
                        continue 'criteria;
                    }
                }
            }
        }
        // once we classified the documents related to the current
        // automatons we save that as the next valid result
        let mut seen = BufferedDistinctMap::new(&mut distinct_map);
        raw_documents_processed.clear();
        // NOTE(review): both `unwrap()` calls below require that the document
        // went through the inner loop above; with an empty `criteria` the
        // caches are never filled and the first document reaching an
        // `unwrap()` would panic; confirm callers always give a criterion
        for document in raw_documents.into_iter().skip(distinct_raw_offset) {
            let filter_accepted = match &filter {
                Some(_) => filter_map.remove(&document.id).unwrap(),
                None => true,
            };
            if filter_accepted {
                let key = key_cache.remove(&document.id).unwrap();
                let distinct_accepted = match key {
                    Some(key) => seen.register(key),
                    None => seen.register_without_key(),
                };
                // keep the document only when the distinct map accepted it
                // and the count of registered documents is past range.start
                if distinct_accepted && seen.len() > range.start {
                    raw_documents_processed.push(document);
                    if raw_documents_processed.len() == range.len() {
                        break;
                    }
                }
            }
        }
        // stop processing when time is running out
        if let Some(timeout) = timeout {
            if start_processing.elapsed() > timeout {
                break;
            }
        }
    }
    // make real documents now that we know
    // those must be returned
    let documents = raw_documents_processed
        .into_iter()
        .map(Document::from_raw)
        .collect();
    Ok(documents)
 }
 #[cfg(test)]
 mod tests {
    use super::*;