Remove the raw_query functions

2024-11-23 10:37:41 +08:00 · 2019-12-11 15:34:30 +01:00 · 2019-12-11 15:34:30 +01:00 · ea148575cf
commit ea148575cf
parent efc2be0b7b
2 changed files with 6 additions and 349 deletions
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -39,7 +39,6 @@ pub fn bucket_sort<'c>(
    synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
 {
-    // let automatons = construct_automatons(query);
    let (automatons, query_enhancer) =
        construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;

@@ -286,14 +285,11 @@ impl<'txn> PostingsListView<'txn> {
    }

    pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
-        *self = match self {
-            PostingsListView::Original { input, .. } => {
-                PostingsListView::Rewritten { input: input.clone(), postings_list }
-            },
-            PostingsListView::Rewritten { input, .. } => {
-                PostingsListView::Rewritten { input: input.clone(), postings_list }
-            },
+        let input = match self {
+            PostingsListView::Original { input, .. } => input.clone(),
+            PostingsListView::Rewritten { input, .. } => input.clone(),
        };
+        *self = PostingsListView::rewritten(input, postings_list);
    }

    pub fn len(&self) -> usize {
@@ -565,7 +561,8 @@ fn construct_automatons2(
                }
            }

-            if true && n == 1 {
+            if n == 1 {
+                // automatons for splitted words
                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
                    let mut left_automaton = QueryWordAutomaton::exact(left);
                    left_automaton.phrase_query = Some((0, 2));
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -399,346 +399,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
    }
 }

-fn raw_query<'c, FI>(
-    reader: &heed::RoTxn<MainT>,
-
-    query: &str,
-    range: Range<usize>,
-
-    filter: Option<FI>,
-    timeout: Option<Duration>,
-
-    criteria: Criteria<'c>,
-    searchable_attrs: Option<ReorderedAttrs>,
-
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    documents_fields_counts_store: store::DocumentsFieldsCounts,
-    synonyms_store: store::Synonyms,
-) -> MResult<Vec<Document>>
-where
-    FI: Fn(DocumentId) -> bool,
-{
-    // We delegate the filter work to the distinct query builder,
-    // specifying a distinct rule that has no effect.
-    if filter.is_some() {
-        let distinct = |_| None;
-        let distinct_size = 1;
-        return raw_query_with_distinct(
-            reader,
-            query,
-            range,
-            filter,
-            distinct,
-            distinct_size,
-            timeout,
-            criteria,
-            searchable_attrs,
-            main_store,
-            postings_lists_store,
-            documents_fields_counts_store,
-            synonyms_store,
-        );
-    }
-
-    let start_processing = Instant::now();
-    let mut raw_documents_processed = Vec::with_capacity(range.len());
-
-    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
-        reader,
-        query,
-        main_store,
-        postings_lists_store,
-        synonyms_store,
-    )?;
-
-    let automaton_producer = automaton_producer.into_iter();
-    let mut automatons = Vec::new();
-
-    // aggregate automatons groups by groups after time
-    for auts in automaton_producer {
-        automatons.push(auts);
-
-        for (i, group) in automatons.iter().enumerate() {
-            debug!("group {} automatons {:?}", i, group.automatons);
-        }
-
-        let before_fetch_raw_documents = Instant::now();
-        // we must retrieve the documents associated
-        // with the current automatons
-        let mut raw_documents = fetch_raw_documents(
-            reader,
-            &automatons,
-            &query_enhancer,
-            searchable_attrs.as_ref(),
-            main_store,
-            postings_lists_store,
-        )?;
-        debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
-
-        // stop processing when time is running out
-        if let Some(timeout) = timeout {
-            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
-                break;
-            }
-        }
-
-        let before_bucket_sort = Instant::now();
-
-        let mut groups = vec![raw_documents.as_mut_slice()];
-
-        'criteria: for criterion in criteria.as_ref() {
-            let tmp_groups = mem::replace(&mut groups, Vec::new());
-            let mut documents_seen = 0;
-
-            for group in tmp_groups {
-                // if this group does not overlap with the requested range,
-                // push it without sorting and splitting it
-                if documents_seen + group.len() < range.start {
-                    documents_seen += group.len();
-                    groups.push(group);
-                    continue;
-                }
-
-                // we must pull the fields counts of these documents
-                // TODO it would be great to had a "dependency" thing for each criterion
-                //      and make it so that we can be lazy on pulling/computing some data.
-                if criterion.name() == "Exact" {
-                    for document in group.iter_mut() {
-                        let mut fields_counts = Vec::new();
-                        for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
-                            let (attr, count) = result?;
-                            fields_counts.push(AttrCount { attr: attr.0, count });
-                        }
-                        document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
-                    }
-                }
-
-
-                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
-
-                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
-                    debug!("criterion {} produced a group of size {}", criterion.name(), group.len());
-
-                    documents_seen += group.len();
-                    groups.push(group);
-
-
-                    // we have sort enough documents if the last document sorted is after
-                    // the end of the requested range, we can continue to the next criterion
-                    if documents_seen >= range.end {
-                        continue 'criteria;
-                    }
-                }
-            }
-        }
-
-        debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed());
-
-        // once we classified the documents related to the current
-        // automatons we save that as the next valid result
-        let iter = raw_documents
-            .into_iter()
-            .skip(range.start)
-            .take(range.len());
-        raw_documents_processed.clear();
-        raw_documents_processed.extend(iter);
-
-        // stop processing when time is running out
-        if let Some(timeout) = timeout {
-            if start_processing.elapsed() > timeout {
-                break;
-            }
-        }
-    }
-
-    // make real documents now that we know
-    // those must be returned
-    let documents = raw_documents_processed
-        .into_iter()
-        .map(Document::from_raw)
-        .collect();
-
-    Ok(documents)
-}
-
-fn raw_query_with_distinct<'c, FI, FD>(
-    reader: &heed::RoTxn<MainT>,
-
-    query: &str,
-    range: Range<usize>,
-
-    filter: Option<FI>,
-
-    distinct: FD,
-    distinct_size: usize,
-    timeout: Option<Duration>,
-
-    criteria: Criteria<'c>,
-    searchable_attrs: Option<ReorderedAttrs>,
-
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    documents_fields_counts_store: store::DocumentsFieldsCounts,
-    synonyms_store: store::Synonyms,
-) -> MResult<Vec<Document>>
-where
-    FI: Fn(DocumentId) -> bool,
-    FD: Fn(DocumentId) -> Option<u64>,
-{
-    let start_processing = Instant::now();
-    let mut raw_documents_processed = Vec::new();
-
-    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
-        reader,
-        query,
-        main_store,
-        postings_lists_store,
-        synonyms_store,
-    )?;
-
-    let automaton_producer = automaton_producer.into_iter();
-    let mut automatons = Vec::new();
-
-    // aggregate automatons groups by groups after time
-    for auts in automaton_producer {
-        automatons.push(auts);
-
-        // we must retrieve the documents associated
-        // with the current automatons
-        let mut raw_documents = fetch_raw_documents(
-            reader,
-            &automatons,
-            &query_enhancer,
-            searchable_attrs.as_ref(),
-            main_store,
-            postings_lists_store,
-        )?;
-
-        // stop processing when time is running out
-        if let Some(timeout) = timeout {
-            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
-                break;
-            }
-        }
-
-        let mut groups = vec![raw_documents.as_mut_slice()];
-        let mut key_cache = HashMap::new();
-
-        let mut filter_map = HashMap::new();
-        // these two variables informs on the current distinct map and
-        // on the raw offset of the start of the group where the
-        // range.start bound is located according to the distinct function
-        let mut distinct_map = DistinctMap::new(distinct_size);
-        let mut distinct_raw_offset = 0;
-
-        'criteria: for criterion in criteria.as_ref() {
-            let tmp_groups = mem::replace(&mut groups, Vec::new());
-            let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
-            let mut documents_seen = 0;
-
-            for group in tmp_groups {
-                // if this group does not overlap with the requested range,
-                // push it without sorting and splitting it
-                if documents_seen + group.len() < distinct_raw_offset {
-                    documents_seen += group.len();
-                    groups.push(group);
-                    continue;
-                }
-
-                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
-
-                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
-                    // we must compute the real distinguished len of this sub-group
-                    for document in group.iter() {
-                        let filter_accepted = match &filter {
-                            Some(filter) => {
-                                let entry = filter_map.entry(document.id);
-                                *entry.or_insert_with(|| (filter)(document.id))
-                            }
-                            None => true,
-                        };
-
-                        if filter_accepted {
-                            let entry = key_cache.entry(document.id);
-                            let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
-
-                            match key.clone() {
-                                Some(key) => buf_distinct.register(key),
-                                None => buf_distinct.register_without_key(),
-                            };
-                        }
-
-                        // the requested range end is reached: stop computing distinct
-                        if buf_distinct.len() >= range.end {
-                            break;
-                        }
-                    }
-
-                    documents_seen += group.len();
-                    groups.push(group);
-
-                    // if this sub-group does not overlap with the requested range
-                    // we must update the distinct map and its start index
-                    if buf_distinct.len() < range.start {
-                        buf_distinct.transfert_to_internal();
-                        distinct_raw_offset = documents_seen;
-                    }
-
-                    // we have sort enough documents if the last document sorted is after
-                    // the end of the requested range, we can continue to the next criterion
-                    if buf_distinct.len() >= range.end {
-                        continue 'criteria;
-                    }
-                }
-            }
-        }
-
-        // once we classified the documents related to the current
-        // automatons we save that as the next valid result
-        let mut seen = BufferedDistinctMap::new(&mut distinct_map);
-        raw_documents_processed.clear();
-
-        for document in raw_documents.into_iter().skip(distinct_raw_offset) {
-            let filter_accepted = match &filter {
-                Some(_) => filter_map.remove(&document.id).unwrap(),
-                None => true,
-            };
-
-            if filter_accepted {
-                let key = key_cache.remove(&document.id).unwrap();
-                let distinct_accepted = match key {
-                    Some(key) => seen.register(key),
-                    None => seen.register_without_key(),
-                };
-
-                if distinct_accepted && seen.len() > range.start {
-                    raw_documents_processed.push(document);
-                    if raw_documents_processed.len() == range.len() {
-                        break;
-                    }
-                }
-            }
-        }
-
-        // stop processing when time is running out
-        if let Some(timeout) = timeout {
-            if start_processing.elapsed() > timeout {
-                break;
-            }
-        }
-    }
-
-    // make real documents now that we know
-    // those must be returned
-    let documents = raw_documents_processed
-        .into_iter()
-        .map(Document::from_raw)
-        .collect();
-
-    Ok(documents)
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;