69 lines
2.7 KiB
Rust
Raw Normal View History

2021-08-23 18:41:48 +02:00
use std::fs::File;
use std::io::{self, BufReader};
2021-08-23 18:41:48 +02:00
use concat_arrays::concat_arrays;
2022-05-02 19:19:50 +02:00
use serde_json::Value;
2021-08-23 18:41:48 +02:00
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
2022-05-02 19:19:50 +02:00
use crate::error::GeoError;
use crate::update::index_documents::extract_finite_float_from_value;
use crate::{FieldId, InternalError, Result};
2021-08-23 18:41:48 +02:00
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
///
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
#[logging_timer::time]
2022-02-16 15:28:48 +01:00
pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
2021-08-23 18:41:48 +02:00
indexer: GrenadParameters,
primary_key_id: FieldId,
2022-03-23 17:28:41 +01:00
(lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
2022-02-16 15:28:48 +01:00
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
2021-08-23 18:41:48 +02:00
2022-02-16 15:28:48 +01:00
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
2021-08-23 18:41:48 +02:00
let obkv = obkv::KvReader::new(value);
2022-05-02 19:19:50 +02:00
// since we only needs the primary key when we throw an error we create this getter to
// lazily get it when needed
let document_id = || -> Value {
let document_id = obkv.get(primary_key_id).unwrap();
serde_json::from_slice(document_id).unwrap()
2022-05-02 19:19:50 +02:00
};
// first we get the two fields
let lat = obkv.get(lat_fid);
let lng = obkv.get(lng_fid);
2022-05-02 19:19:50 +02:00
if let Some((lat, lng)) = lat.zip(lng) {
// then we extract the values
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
2022-05-02 19:19:50 +02:00
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
2022-03-23 17:28:41 +01:00
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
writer.insert(docid_bytes, bytes)?;
} else if lat.is_none() && lng.is_some() {
return Err(GeoError::MissingLatitude { document_id: document_id() })?;
} else if lat.is_some() && lng.is_none() {
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
}
2023-02-20 13:45:51 +01:00
// else => the _geo object was `null`, there is nothing to do
2021-08-23 18:41:48 +02:00
}
writer_into_reader(writer)
2021-08-23 18:41:48 +02:00
}