Remove the serde Indexer

This commit is contained in:
Kerollmops 2020-05-18 12:22:06 +02:00
parent 615825b9fd
commit 5e063da14f
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 74 additions and 370 deletions

View File

@ -34,7 +34,7 @@ pest_derive = "2.0"
regex = "1.3.6" regex = "1.3.6"
sdset = "0.4.0" sdset = "0.4.0"
serde = { version = "1.0.105", features = ["derive"] } serde = { version = "1.0.105", features = ["derive"] }
serde_json = "1.0.50" serde_json = { version = "1.0.50", features = ["preserve_order"] }
siphasher = "0.3.2" siphasher = "0.3.2"
slice-group-by = "0.2.6" slice-group-by = "0.2.6"
unicase = "2.6.0" unicase = "2.6.0"

View File

@ -1,362 +0,0 @@
use meilisearch_schema::IndexedPos;
use serde::ser;
use serde::Serialize;
use super::{ConvertToString, SerializerError};
use crate::raw_indexer::RawIndexer;
use crate::DocumentId;
pub struct Indexer<'a> {
pub pos: IndexedPos,
pub indexer: &'a mut RawIndexer,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Indexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;
type SerializeSeq = SeqIndexer<'a>;
type SerializeTuple = TupleIndexer<'a>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructIndexer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Ok(None)
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
let number_of_words = self
.indexer
.index_text(self.document_id, self.pos, text);
Ok(Some(number_of_words))
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Ok(None)
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
let number_of_words = self
.indexer
.index_text(self.document_id, self.pos, &text);
Ok(Some(number_of_words))
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Ok(None)
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Ok(None)
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Ok(None)
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: ser::Serialize,
{
Err(SerializerError::UnindexableType {
type_name: "newtype variant",
})
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
let indexer = SeqIndexer {
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
let indexer = TupleIndexer {
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "tuple struct",
})
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "tuple variant",
})
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let indexer = MapIndexer {
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
let indexer = StructIndexer {
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "struct variant",
})
}
}
pub struct SeqIndexer<'a> {
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}
pub struct MapIndexer<'a> {
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}
pub struct StructIndexer<'a> {
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?;
self.texts.push(key_text);
self.texts.push(value_text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}
pub struct TupleIndexer<'a> {
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}

View File

@ -12,13 +12,11 @@ mod convert_to_number;
mod convert_to_string; mod convert_to_string;
mod deserializer; mod deserializer;
mod extract_document_id; mod extract_document_id;
mod indexer;
pub use self::convert_to_number::ConvertToNumber; pub use self::convert_to_number::ConvertToNumber;
pub use self::convert_to_string::ConvertToString; pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError}; pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer;
use std::{error::Error, fmt}; use std::{error::Error, fmt};

View File

@ -1,4 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt::Write as _;
use std::fmt;
use fst::{set::OpBuilder, SetBuilder}; use fst::{set::OpBuilder, SetBuilder};
use indexmap::IndexMap; use indexmap::IndexMap;
@ -6,12 +8,15 @@ use sdset::{duo::Union, SetOperation};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use meilisearch_types::DocumentId;
use meilisearch_schema::IndexedPos;
use crate::database::{MainT, UpdateT}; use crate::database::{MainT, UpdateT};
use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::facets; use crate::facets;
use crate::raw_indexer::RawIndexer; use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, Deserializer}; use crate::serde::{extract_document_id, Deserializer};
use crate::serde::{ConvertToNumber, Indexer}; use crate::serde::ConvertToNumber;
use crate::store; use crate::store;
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
use crate::{Error, MResult, RankedMap}; use crate::{Error, MResult, RankedMap};
@ -106,6 +111,69 @@ pub fn push_documents_addition<D: serde::Serialize>(
Ok(last_update_id) Ok(last_update_id)
} }
// TODO move this helper functions elsewhere
/// Returns the number of words indexed or `None` if the type
fn index_value(
indexer: &mut RawIndexer,
document_id: DocumentId,
indexed_pos: IndexedPos,
value: &Value,
) -> Option<usize>
{
fn value_to_string(string: &mut String, value: &Value) {
match value {
Value::Null => (),
Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); },
Value::Number(number) => { let _ = write!(string, "{}", &number); },
Value::String(text) => string.push_str(&text),
Value::Array(array) => {
for value in array {
value_to_string(string, value);
let _ = string.write_str(". ");
}
},
Value::Object(object) => {
for (key, value) in object {
string.push_str(key);
let _ = string.write_str(". ");
value_to_string(string, value);
let _ = string.write_str(". ");
}
},
}
}
match value {
Value::Null => None,
Value::Bool(boolean) => {
let text = boolean.to_string();
let number_of_words = indexer.index_text(document_id, indexed_pos, &text);
Some(number_of_words)
},
Value::Number(number) => {
let text = number.to_string();
let number_of_words = indexer.index_text(document_id, indexed_pos, &text);
Some(number_of_words)
},
Value::String(string) => {
let number_of_words = indexer.index_text(document_id, indexed_pos, &string);
Some(number_of_words)
},
Value::Array(_) => {
let mut text = String::new();
value_to_string(&mut text, value);
let number_of_words = indexer.index_text(document_id, indexed_pos, &text);
Some(number_of_words)
},
Value::Object(_) => {
let mut text = String::new();
value_to_string(&mut text, value);
let number_of_words = indexer.index_text(document_id, indexed_pos, &text);
Some(number_of_words)
},
}
}
pub fn apply_addition<'a, 'b>( pub fn apply_addition<'a, 'b>(
writer: &'a mut heed::RwTxn<'b, MainT>, writer: &'a mut heed::RwTxn<'b, MainT>,
index: &store::Index, index: &store::Index,
@ -183,8 +251,8 @@ pub fn apply_addition<'a, 'b>(
index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
if let Some(indexed_pos) = schema.is_indexed(field_id) { if let Some(indexed_pos) = schema.is_indexed(field_id) {
let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value);
if let Some(number_of_words) = value.serialize(indexer)? { if let Some(number_of_words) = number_of_words {
index.documents_fields_counts.put_document_field_count( index.documents_fields_counts.put_document_field_count(
writer, writer,
document_id, document_id,
@ -280,8 +348,8 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind
index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?;
if let Some(indexed_pos) = schema.is_indexed(field_id) { if let Some(indexed_pos) = schema.is_indexed(field_id) {
let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value);
if let Some(number_of_words) = value.serialize(indexer)? { if let Some(number_of_words) = number_of_words {
index.documents_fields_counts.put_document_field_count( index.documents_fields_counts.put_document_field_count(
writer, writer,
document_id, document_id,