Extract and pack the points to inserted them in the rtree

This commit is contained in:
Clément Renault 2024-11-12 17:28:09 +01:00
parent 609545072f
commit 66ba176df8
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
2 changed files with 80 additions and 39 deletions

View File

@ -1,10 +1,10 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::fs::File; use std::fs::File;
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _};
use std::{mem, result}; use std::{iter, mem, result};
use bumpalo::Bump; use bumpalo::Bump;
use bytemuck::{bytes_of, Pod, Zeroable}; use bytemuck::{bytes_of, from_bytes, pod_read_unaligned, Pod, Zeroable};
use heed::RoTxn; use heed::RoTxn;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use serde_json::Value; use serde_json::Value;
@ -15,7 +15,7 @@ use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extra
use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::DocumentChange; use crate::update::new::DocumentChange;
use crate::update::GrenadParameters; use crate::update::GrenadParameters;
use crate::{DocumentId, Index, InternalError, Object, Result}; use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Object, Result};
pub struct GeoExtractor { pub struct GeoExtractor {
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
@ -39,16 +39,26 @@ impl GeoExtractor {
#[derive(Pod, Zeroable, Copy, Clone)] #[derive(Pod, Zeroable, Copy, Clone)]
#[repr(C, packed)] #[repr(C, packed)]
pub struct GeoPoint { pub struct ExtractedGeoPoint {
document_id: DocumentId, pub docid: DocumentId,
lat_lng: [f64; 2], pub lat_lng: [f64; 2],
}
impl From<ExtractedGeoPoint> for GeoPoint {
/// Converts the latitude and longitude back to an xyz GeoPoint.
fn from(value: ExtractedGeoPoint) -> Self {
let [lat, lng] = value.lat_lng;
let point = [lat, lng];
let xyz_point = lat_lng_to_xyz(&point);
GeoPoint::new(xyz_point, (value.docid, point))
}
} }
pub struct GeoExtractorData<'extractor> { pub struct GeoExtractorData<'extractor> {
/// The set of documents ids that were removed. If a document sees its geo /// The set of documents ids that were removed. If a document sees its geo
/// point being updated, we first put it in the deleted and then in the inserted. /// point being updated, we first put it in the deleted and then in the inserted.
removed: bumpalo::collections::Vec<'extractor, GeoPoint>, removed: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
inserted: bumpalo::collections::Vec<'extractor, GeoPoint>, inserted: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
/// TODO Do the doc /// TODO Do the doc
spilled_removed: Option<BufWriter<File>>, spilled_removed: Option<BufWriter<File>>,
/// TODO Do the doc /// TODO Do the doc
@ -75,39 +85,50 @@ impl<'extractor> GeoExtractorData<'extractor> {
unsafe impl MostlySend for GeoExtractorData<'_> {} unsafe impl MostlySend for GeoExtractorData<'_> {}
pub struct FrozenGeoExtractorData<'extractor> { pub struct FrozenGeoExtractorData<'extractor> {
pub removed: &'extractor [GeoPoint], pub removed: &'extractor [ExtractedGeoPoint],
pub inserted: &'extractor [GeoPoint], pub inserted: &'extractor [ExtractedGeoPoint],
pub spilled_removed: Option<BufReader<File>>, pub spilled_removed: Option<BufReader<File>>,
pub spilled_inserted: Option<BufReader<File>>, pub spilled_inserted: Option<BufReader<File>>,
} }
impl<'extractor> FrozenGeoExtractorData<'extractor> { impl<'extractor> FrozenGeoExtractorData<'extractor> {
pub fn get_removed_and_clear(&mut self) -> io::Result<RoaringBitmap> { pub fn iter_and_clear_removed(
let mut output = RoaringBitmap::new(); &mut self,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
let mut iter = self.removed.iter_and_clear(); mem::take(&mut self.removed)
while let Some(block) = iter.next_block() { .iter()
let numbers = block.iter().copied(); .copied()
output |= RoaringBitmap::from_sorted_iter(numbers).unwrap(); .map(Ok)
.chain(iterator_over_spilled_geopoints(&mut self.spilled_removed))
} }
if let Some(mut file) = self.spilled_removed.take() { pub fn iter_and_clear_inserted(
let mut number_bytes = [0u8; mem::size_of::<DocumentId>()]; &mut self,
loop { ) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
match file.read_exact(&mut number_bytes) { mem::take(&mut self.inserted)
Ok(()) => { .iter()
let number = u32::from_be_bytes(number_bytes); .copied()
output.insert(number); .map(Ok)
} .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted))
Err(e) if e.kind() == ErrorKind::UnexpectedEof => (),
Err(e) => return Err(e),
}
} }
} }
Ok(output) fn iterator_over_spilled_geopoints(
spilled: &mut Option<BufReader<File>>,
) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ {
let mut spilled = spilled.take();
iter::from_fn(move || match &mut spilled {
Some(file) => {
let geopoint_bytes = &mut [0u8; mem::size_of::<ExtractedGeoPoint>()];
match file.read_exact(geopoint_bytes) {
Ok(()) => Some(Ok(pod_read_unaligned(geopoint_bytes))),
Err(e) if e.kind() == ErrorKind::UnexpectedEof => None,
Err(e) => return Some(Err(e)),
} }
} }
None => None,
})
}
impl<'extractor> Extractor<'extractor> for GeoExtractor { impl<'extractor> Extractor<'extractor> for GeoExtractor {
type Data = RefCell<GeoExtractorData<'extractor>>; type Data = RefCell<GeoExtractorData<'extractor>>;
@ -151,7 +172,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
.transpose()?; .transpose()?;
if let Some(lat_lng) = current_geo { if let Some(lat_lng) = current_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng }; let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_removed { match &mut data_ref.spilled_removed {
Some(file) => file.write_all(bytes_of(&geopoint))?, Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.removed.push(geopoint), None => data_ref.removed.push(geopoint),
@ -179,7 +200,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
// we need to replace the current by the new point and therefore // we need to replace the current by the new point and therefore
// delete the current point from the RTree. // delete the current point from the RTree.
if let Some(lat_lng) = current_geo { if let Some(lat_lng) = current_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng }; let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_removed { match &mut data_ref.spilled_removed {
Some(file) => file.write_all(bytes_of(&geopoint))?, Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.removed.push(geopoint), None => data_ref.removed.push(geopoint),
@ -187,7 +208,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
} }
if let Some(lat_lng) = updated_geo { if let Some(lat_lng) = updated_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng }; let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_inserted { match &mut data_ref.spilled_inserted {
Some(file) => file.write_all(bytes_of(&geopoint))?, Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.inserted.push(geopoint), None => data_ref.inserted.push(geopoint),
@ -206,7 +227,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
.transpose()?; .transpose()?;
if let Some(lat_lng) = inserted_geo { if let Some(lat_lng) = inserted_geo {
let geopoint = GeoPoint { document_id: docid, lat_lng }; let geopoint = ExtractedGeoPoint { docid, lat_lng };
match &mut data_ref.spilled_inserted { match &mut data_ref.spilled_inserted {
Some(file) => file.write_all(bytes_of(&geopoint))?, Some(file) => file.write_all(bytes_of(&geopoint))?,
None => data_ref.inserted.push(geopoint), None => data_ref.inserted.push(geopoint),

View File

@ -1,8 +1,10 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::io;
use hashbrown::HashSet; use hashbrown::HashSet;
use heed::types::Bytes; use heed::types::Bytes;
use heed::{Database, RoTxn}; use heed::{Database, RoTxn};
use memmap2::Mmap;
use rayon::iter::{IntoParallelIterator, ParallelIterator}; use rayon::iter::{IntoParallelIterator, ParallelIterator};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -11,7 +13,7 @@ use super::extract::{
merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind,
GeoExtractorData, GeoExtractorData,
}; };
use crate::{CboRoaringBitmapCodec, FieldId, Index, InternalError, Result}; use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_rtree<'extractor, MSP>( pub fn merge_and_send_rtree<'extractor, MSP>(
@ -25,13 +27,31 @@ where
MSP: Fn() -> bool + Sync, MSP: Fn() -> bool + Sync,
{ {
let mut rtree = index.geo_rtree(rtxn)?.unwrap_or_default(); let mut rtree = index.geo_rtree(rtxn)?.unwrap_or_default();
for data in datastore { for data in datastore {
let mut frozen = data.into_inner().freeze()?; if must_stop_processing() {
let removed = frozen.get_removed_and_clear()?; return Err(InternalError::AbortedIndexation.into());
removed.into_iter().for_each(|docid| rtree.remove(t));
} }
let mut frozen = data.into_inner().freeze()?;
for result in frozen.iter_and_clear_removed() {
let extracted_geo_point = result?;
rtree.remove(&GeoPoint::from(extracted_geo_point));
}
for result in frozen.iter_and_clear_inserted() {
let extracted_geo_point = result?;
rtree.insert(GeoPoint::from(extracted_geo_point));
}
}
let mut file = tempfile::tempfile()?;
/// manage error
bincode::serialize_into(&mut file, &rtree).unwrap();
file.sync_all()?;
let rtree_mmap = unsafe { Mmap::map(&file)? };
geo_sender.set_rtree(rtree_mmap).unwrap();
Ok(()) Ok(())
} }