feat: Replace the HashMap with a simple Vec in stream ops

This commit is contained in:
Clément Renault 2018-09-10 19:47:40 +02:00
parent 31a83eae4d
commit f6a40ed7e4
5 changed files with 28 additions and 33 deletions

View File

@ -5,8 +5,8 @@ use std::path::Path;
use std::ops::Deref; use std::ops::Deref;
use std::sync::Arc; use std::sync::Arc;
use std::mem; use std::mem;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::raw::MmapReadOnly; use fst::raw::MmapReadOnly;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use crate::DocIndex; use crate::DocIndex;
#[repr(C)] #[repr(C)]
@ -180,9 +180,6 @@ fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>,
let mut ranges = Vec::with_capacity(cap); let mut ranges = Vec::with_capacity(cap);
let mut values = Vec::with_capacity(number_docs); let mut values = Vec::with_capacity(number_docs);
// @Improvement: remove bounds duplications: the left bound of a range
// is already the right bound of the previous range,
// we could use a slice window of size 2.
for v in &vecs { for v in &vecs {
let len = v.len() as u64; let len = v.len() as u64;
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);

View File

@ -1,15 +1,13 @@
// pub mod difference;
// pub mod stream_ops;
mod ops_indexed_value; mod ops_indexed_value;
pub mod ops; pub mod ops;
pub mod doc_indexes; pub mod doc_indexes;
use fst::{Map, MapBuilder}; use fst::{Map, MapBuilder};
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
use std::error::Error; use std::error::Error;
use std::path::Path; use std::path::Path;
use std::io::Write; use std::io::Write;
use crate::DocIndex; use crate::DocIndex;
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
pub struct Metadata { pub struct Metadata {
map: Map, map: Map,
@ -87,8 +85,6 @@ impl<W: Write, X: Write> MetadataBuilder<W, X> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::vec_read_only::VecReadOnly;
use crate::metadata::ops::IndexedDocIndexes;
#[test] #[test]
fn empty_serialize_deserialize() { fn empty_serialize_deserialize() {

View File

@ -1,5 +1,4 @@
use std::hash::{Hash, Hasher}; use std::collections::BTreeMap;
use std::collections::{HashMap, BTreeMap};
use fst::{map, Streamer, Automaton}; use fst::{map, Streamer, Automaton};
use fst::automaton::AlwaysMatch; use fst::automaton::AlwaysMatch;
use sdset::multi::OpBuilder as SdOpBuilder; use sdset::multi::OpBuilder as SdOpBuilder;
@ -9,7 +8,6 @@ use crate::metadata::ops_indexed_value::{
}; };
use crate::metadata::doc_indexes::DocIndexes; use crate::metadata::doc_indexes::DocIndexes;
use crate::metadata::Metadata; use crate::metadata::Metadata;
use crate::automaton::AutomatonExt;
use crate::vec_read_only::VecReadOnly; use crate::vec_read_only::VecReadOnly;
use crate::DocIndex; use crate::DocIndex;
@ -60,19 +58,19 @@ impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
} }
pub fn union(self) -> Union<'m> { pub fn union(self) -> Union<'m> {
Union::new(self.maps, self.indexes) Union::new(self.maps, self.indexes, self.automatons.len())
} }
pub fn intersection(self) -> Intersection<'m> { pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.maps, self.indexes) Intersection::new(self.maps, self.indexes, self.automatons.len())
} }
pub fn difference(self) -> Difference<'m> { pub fn difference(self) -> Difference<'m> {
Difference::new(self.maps, self.indexes) Difference::new(self.maps, self.indexes, self.automatons.len())
} }
pub fn symmetric_difference(self) -> SymmetricDifference<'m> { pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.maps, self.indexes) SymmetricDifference::new(self.maps, self.indexes, self.automatons.len())
} }
} }
@ -94,15 +92,16 @@ macro_rules! logical_operation {
pub struct $name<'m> { pub struct $name<'m> {
maps: UnionIndexedValue<'m>, maps: UnionIndexedValue<'m>,
indexes: Vec<&'m DocIndexes>, indexes: Vec<&'m DocIndexes>,
number_automatons: usize,
outs: Vec<IndexedDocIndexes>, outs: Vec<IndexedDocIndexes>,
} }
impl<'m> $name<'m> { impl<'m> $name<'m> {
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>) -> Self fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self {
{
$name { $name {
maps: maps.union(), maps: maps.union(),
indexes: indexes, indexes: indexes,
number_automatons: number_automatons,
outs: Vec::new(), outs: Vec::new(),
} }
} }
@ -116,17 +115,15 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
Some((input, ivalues)) => { Some((input, ivalues)) => {
self.outs.clear(); self.outs.clear();
// @Improvement: better use a `Vec` instead, let mut builders = vec![BTreeMap::new(); self.number_automatons];
// `aut indexes` follow them selfs
let mut builders = HashMap::new();
for iv in ivalues { for iv in ivalues {
let builder = builders.entry(iv.aut_index).or_insert_with(BTreeMap::new); let builder = &mut builders[iv.aut_index];
builder.insert(iv.rdr_index, iv.value); builder.insert(iv.rdr_index, iv.value);
} }
let mut doc_indexes = Vec::new(); let mut doc_indexes = Vec::new();
let mut doc_indexes_slots = Vec::with_capacity(builders.len()); let mut doc_indexes_slots = Vec::with_capacity(builders.len());
for (aut_index, values) in builders.into_iter() { for (aut_index, values) in builders.into_iter().enumerate() {
let mut builder = SdOpBuilder::with_capacity(values.len()); let mut builder = SdOpBuilder::with_capacity(values.len());
for (rdr_index, value) in values { for (rdr_index, value) in values {
let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes"); let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
@ -137,8 +134,7 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
let start = doc_indexes.len(); let start = doc_indexes.len();
builder.$operation().extend_vec(&mut doc_indexes); builder.$operation().extend_vec(&mut doc_indexes);
let len = doc_indexes.len() - start; let len = doc_indexes.len() - start;
if len == 0 { continue } if len != 0 {
let slot = SlotIndexedDocIndexes { let slot = SlotIndexedDocIndexes {
index: aut_index, index: aut_index,
start: start, start: start,
@ -146,6 +142,7 @@ impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
}; };
doc_indexes_slots.push(slot); doc_indexes_slots.push(slot);
} }
}
let read_only = VecReadOnly::new(doc_indexes); let read_only = VecReadOnly::new(doc_indexes);
self.outs.reserve(doc_indexes_slots.len()); self.outs.reserve(doc_indexes_slots.len());

View File

@ -38,6 +38,12 @@ pub struct UnionIndexedValue<'f> {
cur_slot: Option<SlotIndexedValue>, cur_slot: Option<SlotIndexedValue>,
} }
impl<'f> UnionIndexedValue<'f> {
    /// Returns the slot count reported by the underlying heap
    /// (the result of `self.heap.num_slots()`).
    ///
    /// NOTE(review): presumably this is the number of streams currently
    /// held by the heap; confirm against the heap implementation.
    pub fn len(&self) -> usize {
        self.heap.num_slots()
    }

    /// Returns `true` when `len()` is zero.
    ///
    /// A public `len` is conventionally paired with `is_empty`, so callers
    /// do not have to write `len() == 0` themselves.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> { impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
type Item = (&'a [u8], &'a [IndexedValue]); type Item = (&'a [u8], &'a [IndexedValue]);
@ -54,7 +60,7 @@ impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
}; };
self.outs.clear(); self.outs.clear();
self.outs.push(slot.indexed_value()); self.outs.push(slot.indexed_value());
while let Some(mut slot2) = self.heap.pop_if_equal(slot.input()) { while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
self.outs.push(slot2.indexed_value()); self.outs.push(slot2.indexed_value());
self.heap.refill(slot2); self.heap.refill(slot2);
} }

View File

@ -14,7 +14,6 @@ use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt}; use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::Metadata; use crate::metadata::Metadata;
use crate::metadata::ops::{OpBuilder, Union}; use crate::metadata::ops::{OpBuilder, Union};
use crate::metadata::doc_indexes::DocIndexes;
use crate::{Match, DocumentId}; use crate::{Match, DocumentId};
use self::{ use self::{