70 lines
2.6 KiB
Rust
Raw Normal View History

2021-08-23 18:41:48 +02:00
use std::fs::File;
use std::io;
2022-05-02 19:19:50 +02:00
use std::result::Result as StdResult;
2021-08-23 18:41:48 +02:00
use concat_arrays::concat_arrays;
2022-05-02 19:19:50 +02:00
use serde_json::Value;
2021-08-23 18:41:48 +02:00
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
2022-05-02 19:19:50 +02:00
use crate::error::GeoError;
use crate::{FieldId, InternalError, Result, UserError};
2021-08-23 18:41:48 +02:00
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
///
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
2022-02-16 15:28:48 +01:00
pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
2021-08-23 18:41:48 +02:00
indexer: GrenadParameters,
primary_key_id: FieldId,
2022-03-23 17:28:41 +01:00
(lat_fid, lng_fid): (FieldId, FieldId),
2021-08-23 18:41:48 +02:00
) -> Result<grenad::Reader<File>> {
2022-02-16 15:28:48 +01:00
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
2021-08-23 18:41:48 +02:00
2022-02-16 15:28:48 +01:00
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
2021-08-23 18:41:48 +02:00
let obkv = obkv::KvReader::new(value);
2022-05-02 19:19:50 +02:00
// since we only needs the primary key when we throw an error we create this getter to
// lazily get it when needed
let primary_key = || -> Value {
2021-09-08 18:12:10 +02:00
let primary_key = obkv.get(primary_key_id).unwrap();
2022-05-02 19:19:50 +02:00
serde_json::from_slice(primary_key).unwrap()
};
// first we get the two fields
let lat = obkv.get(lat_fid).ok_or_else(|| -> UserError {
GeoError::MissingLatitude { document_id: primary_key() }.into()
})?;
let lng = obkv.get(lng_fid).ok_or_else(|| -> UserError {
GeoError::MissingLongitude { document_id: primary_key() }.into()
2022-03-23 17:28:41 +01:00
})?;
2022-05-02 19:19:50 +02:00
// then we extract the values
let lat = extract_value(serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?)
.map_err(|lat| -> UserError {
GeoError::BadLatitude { document_id: primary_key(), value: lat }.into()
})?;
let lng = extract_value(serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?)
.map_err(|lng| -> UserError {
GeoError::BadLongitude { document_id: primary_key(), value: lng }.into()
})?;
2022-03-23 17:28:41 +01:00
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
writer.insert(docid_bytes, bytes)?;
2021-08-23 18:41:48 +02:00
}
Ok(writer_into_reader(writer)?)
}
2022-05-02 19:19:50 +02:00
/// Converts a JSON value into a finite `f64`.
///
/// Accepts a JSON number, or a JSON string that parses as a float. Non-finite results are
/// rejected: `str::parse::<f64>` happily accepts literals such as `"NaN"`, `"inf"` or
/// `"infinity"`, none of which is a usable coordinate.
///
/// # Errors
///
/// Returns the original `value`, untouched, when it is neither a number nor a parsable string,
/// or when the resulting float is not finite. The caller can embed it in its own error.
fn extract_value(value: Value) -> StdResult<f64, Value> {
    let number = match &value {
        Value::Number(n) => n.as_f64(),
        Value::String(s) => s.parse::<f64>().ok(),
        _ => None,
    };

    // The borrow of `value` ends above, so it can be handed back as the error payload.
    number.filter(|number| number.is_finite()).ok_or(value)
}