Extract and pack the points to inserted them in the rtree

This commit is contained in:
Clément Renault 2024-11-12 17:28:09 +01:00
parent f9e5a06699
commit 19864bcff3
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
2 changed files with 80 additions and 39 deletions

View File

@ -1,10 +1,10 @@
use std::cell::RefCell;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _};
use std::{mem, result};
use std::{iter, mem, result};
use bumpalo::Bump;
use bytemuck::{bytes_of, Pod, Zeroable};
use bytemuck::{bytes_of, from_bytes, pod_read_unaligned, Pod, Zeroable};
use heed::RoTxn;
use serde_json::value::RawValue;
use serde_json::Value;
@ -15,7 +15,7 @@ use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extra
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{DocumentId, Index, InternalError, Object, Result};
use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Object, Result};
pub struct GeoExtractor {
grenad_parameters: GrenadParameters,
@ -39,16 +39,26 @@ impl GeoExtractor {
#[derive(Pod, Zeroable, Copy, Clone)]
#[repr(C, packed)]
pub struct GeoPoint {
document_id: DocumentId,
lat_lng: [f64; 2],
pub struct ExtractedGeoPoint {
pub docid: DocumentId,
pub lat_lng: [f64; 2],
}
impl From<ExtractedGeoPoint> for GeoPoint {
/// Converts the latitude and longitude back to an xyz GeoPoint.
fn from(value: ExtractedGeoPoint) -> Self {
let [lat, lng] = value.lat_lng;
let point = [lat, lng];
let xyz_point = lat_lng_to_xyz(&point);
GeoPoint::new(xyz_point, (value.docid, point))
}
}
pub struct GeoExtractorData<'extractor> {
/// The set of documents ids that were removed. If a document sees its geo
/// point being updated, we first put it in the deleted and then in the inserted.
removed: bumpalo::collections::Vec<'extractor, GeoPoint>,
inserted: bumpalo::collections::Vec<'extractor, GeoPoint>,
removed: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
inserted: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
/// TODO Do the doc
spilled_removed: Option<BufWriter<File>>,
/// TODO Do the doc
@ -75,39 +85,50 @@ impl<'extractor> GeoExtractorData<'extractor> {
unsafe impl MostlySend for GeoExtractorData<'_> {}
pub struct FrozenGeoExtractorData<'extractor> {
pub removed: &'extractor [GeoPoint],
pub inserted: &'extractor [GeoPoint],
pub removed: &'extractor [ExtractedGeoPoint],
pub inserted: &'extractor [ExtractedGeoPoint],
pub spilled_removed: Option<BufReader<File>>,
pub spilled_inserted: Option<BufReader<File>>,
}
impl<'extractor> FrozenGeoExtractorData<'extractor> {
pub fn get_removed_and_clear(&mut self) -> io::Result<RoaringBitmap> {
let mut output = RoaringBitmap::new();
let mut iter = self.removed.iter_and_clear();
while let Some(block) = iter.next_block() {
let numbers = block.iter().copied();
output |= RoaringBitmap::from_sorted_iter(numbers).unwrap();
pub fn iter_and_clear_removed(
&mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
mem::take(&mut self.removed)
.iter()
.copied()
.map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_removed))
}
if let Some(mut file) = self.spilled_removed.take() {
let mut number_bytes = [0u8; mem::size_of::<DocumentId>()];
loop {
match file.read_exact(&mut number_bytes) {
Ok(()) => {
let number = u32::from_be_bytes(number_bytes);
output.insert(number);
}
Err(e) if e.kind() == ErrorKind::UnexpectedEof => (),
Err(e) => return Err(e),
}
pub fn iter_and_clear_inserted(
&mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
mem::take(&mut self.inserted)
.iter()
.copied()
.map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted))
}
}
Ok(output)
fn iterator_over_spilled_geopoints(
spilled: &mut Option<BufReader<File>>,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
let mut spilled = spilled.take();
iter::from_fn(move || match &mut spilled {
Some(file) => {
let geopoint_bytes = &mut [0u8; mem::size_of::<ExtractedGeoPoint>()];
match file.read_exact(geopoint_bytes) {
Ok(()) => Some(Ok(pod_read_unaligned(geopoint_bytes))),
Err(e) if e.kind() == ErrorKind::UnexpectedEof => None,
Err(e) => return Some(Err(e)),
}
}
None => None,
})
}
impl<'extractor> Extractor<'extractor> for GeoExtractor {
type Data = RefCell<GeoExtractorData<'extractor>>;
@ -151,7 +172,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
.transpose()?;
if let Some(lat_lng) = current_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng };
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_removed {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.removed.push(geopoint),
@ -179,7 +200,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
// we need to replace the current by the new point and therefore
// delete the current point from the RTree.
if let Some(lat_lng) = current_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng };
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_removed {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.removed.push(geopoint),
@ -187,7 +208,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
}
if let Some(lat_lng) = updated_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng };
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_inserted {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.inserted.push(geopoint),
@ -206,7 +227,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
.transpose()?;
if let Some(lat_lng) = inserted_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng };
let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_inserted {
Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.inserted.push(geopoint),

View File

@ -1,8 +1,10 @@
use std::cell::RefCell;
use std::io;
use hashbrown::HashSet;
use heed::types::Bytes;
use heed::{Database, RoTxn};
use memmap2::Mmap;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use roaring::RoaringBitmap;
@ -11,7 +13,7 @@ use super::extract::{
merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind,
GeoExtractorData,
};
use crate::{CboRoaringBitmapCodec, FieldId, Index, InternalError, Result};
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_rtree<'extractor, MSP>(
@ -25,13 +27,31 @@ where
MSP: Fn() -> bool + Sync,
{
let mut rtree = index.geo_rtree(rtxn)?.unwrap_or_default();
for data in datastore {
let mut frozen = data.into_inner().freeze()?;
let removed = frozen.get_removed_and_clear()?;
removed.into_iter().for_each(|docid| rtree.remove(t));
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
}
let mut frozen = data.into_inner().freeze()?;
for result in frozen.iter_and_clear_removed() {
let extracted_geo_point = result?;
rtree.remove(&GeoPoint::from(extracted_geo_point));
}
for result in frozen.iter_and_clear_inserted() {
let extracted_geo_point = result?;
rtree.insert(GeoPoint::from(extracted_geo_point));
}
}
let mut file = tempfile::tempfile()?;
/// manage error
bincode::serialize_into(&mut file, &rtree).unwrap();
file.sync_all()?;
let rtree_mmap = unsafe { Mmap::map(&file)? };
geo_sender.set_rtree(rtree_mmap).unwrap();
Ok(())
}