deserialize the rtree only when it's needed, and keep it in memory once it has been deserialized

2025-01-31 23:41:42 +08:00 · 2023-05-02 11:39:35 +02:00 · 2023-05-02 11:39:35 +02:00 · 8875d24a48
commit 8875d24a48
parent c470b67fa2
1 changed files with 9 additions and 29 deletions
--- a/milli/src/search/new/geo_sort.rs
+++ b/milli/src/search/new/geo_sort.rs
@ -111,12 +111,13 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
        // if we had an rtree and the strategy doesn't require one anymore we can drop it
        let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize);
-        if !use_rtree && self.rtree.is_some() {
+        if use_rtree && self.rtree.is_none() {
-            self.rtree = None;
+            self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
        }
        let cache_size = self.strategy.cache_size();
-        if let Some(ref mut rtree) = self.rtree {
+        if use_rtree {
            let rtree = self.rtree.as_ref().unwrap();
            let point = lat_lng_to_xyz(&self.point);
            if self.ascending {
@ -169,18 +170,12 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
                    ))
                })
                .collect::<Result<Vec<(u32, [f64; 2])>>>()?;
-            // REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster
+            // computing the distance between two points is expensive thus we cache the result
            // to use `sort_by_cached_key` instead of `sort_by_key`.
            documents
                .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
            self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id));
        };
        if self.cached_sorted_docids.is_empty() && matches!(self.strategy, Strategy::AlwaysRtree(_))
        {
            // this shouldn't be possible
            self.rtree = None;
        }
        Ok(())
    }
 }
@ -210,15 +205,6 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
        let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
        let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
        self.field_ids = Some([lat, lng]);
        if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
            // REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it
            // every time the geosort ranking rule starts iterating.
            // So we'd initialize it in `::new` and never drop it.
            //
            self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
        }
        self.fill_buffer(ctx)?;
        Ok(())
    }
@ -256,20 +242,14 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
        }
        // if we got out of this loop it means we've exhausted our cache.
-
+        // we need to refill it and run the function again.
-        if self.rtree.is_none() {
+        self.fill_buffer(ctx)?;
-            // with no rtree it means all geo candidates have been returned. We can return all the non geo-faceted documents
+        self.next_bucket(ctx, logger, universe)
            Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }))
        } else {
            // else, we need to refill our bucket and run the function again
            self.fill_buffer(ctx)?;
            self.next_bucket(ctx, logger, universe)
        }
    }
    /// Resets the per-iteration state of the geo sort ranking rule.
    ///
    /// Clears the stored query and the cached sorted document ids. The rtree is
    /// deliberately left untouched: it is only loaded when `self.rtree` is `None`,
    /// so keeping it avoids loading it again on a later iteration.
    fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
        // we do not reset the rtree here, it could be used in a next iteration
        self.query = None;
        self.cached_sorted_docids.clear();
    }
 }