Move to the hgg crate

This commit is contained in:
Kerollmops 2023-06-13 15:19:01 +02:00 committed by Clément Renault
parent 642b0f3a1b
commit 268a9ef416
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
9 changed files with 73 additions and 99 deletions

54
Cargo.lock generated
View File

@ -1736,9 +1736,6 @@ name = "hashbrown"
version = "0.11.2" version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash 0.7.6",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
@ -1749,6 +1746,12 @@ dependencies = [
"ahash 0.7.6", "ahash 0.7.6",
] ]
[[package]]
name = "header-vec"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda7e66d32131841c4264e34a32c934df0dedb08d737f861326d616d4338f06f"
[[package]] [[package]]
name = "heapless" name = "heapless"
version = "0.7.16" version = "0.7.16"
@ -1832,6 +1835,19 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "hgg"
version = "0.4.2-alpha.0"
source = "git+https://github.com/rust-cv/hgg#6d1eacde635158163fb663d9327a2d6f612dd435"
dependencies = [
"ahash 0.7.6",
"hashbrown 0.11.2",
"header-vec",
"num-traits",
"serde",
"space",
]
[[package]] [[package]]
name = "hmac" name = "hmac"
version = "0.12.1" version = "0.12.1"
@ -1841,22 +1857,6 @@ dependencies = [
"digest", "digest",
] ]
[[package]]
name = "hnsw"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b9740ebf8769ec4ad6762cc951ba18f39bba6dfbc2fbbe46285f7539af79752"
dependencies = [
"ahash 0.7.6",
"hashbrown 0.11.2",
"libm",
"num-traits",
"rand_core",
"serde",
"smallvec",
"space",
]
[[package]] [[package]]
name = "http" name = "http"
version = "0.2.9" version = "0.2.9"
@ -2729,7 +2729,7 @@ dependencies = [
"geoutils", "geoutils",
"grenad", "grenad",
"heed", "heed",
"hnsw", "hgg",
"insta", "insta",
"itertools", "itertools",
"json-depth-checker", "json-depth-checker",
@ -2744,7 +2744,6 @@ dependencies = [
"once_cell", "once_cell",
"ordered-float", "ordered-float",
"rand", "rand",
"rand_pcg",
"rayon", "rayon",
"roaring", "roaring",
"rstar", "rstar",
@ -3307,16 +3306,6 @@ dependencies = [
"getrandom", "getrandom",
] ]
[[package]]
name = "rand_pcg"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e"
dependencies = [
"rand_core",
"serde",
]
[[package]] [[package]]
name = "rayon" name = "rayon"
version = "1.7.0" version = "1.7.0"
@ -3776,9 +3765,6 @@ name = "smallvec"
version = "1.10.0" version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "smartstring" name = "smartstring"

View File

@ -33,14 +33,13 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f
"lmdb", "lmdb",
"sync-read-txn", "sync-read-txn",
] } ] }
hnsw = { version = "0.11.0", features = ["serde1"] } hgg = { git = "https://github.com/rust-cv/hgg", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" } json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.10" memmap2 = "0.5.10"
obkv = "0.2.0" obkv = "0.2.0"
once_cell = "1.17.1" once_cell = "1.17.1"
ordered-float = "3.6.0" ordered-float = "3.6.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.7.0" rayon = "1.7.0"
roaring = "0.10.1" roaring = "0.10.1"
rstar = { version = "0.10.0", features = ["serde"] } rstar = { version = "0.10.0", features = ["serde"] }

View File

@ -18,3 +18,17 @@ impl Metric<Vec<f32>> for DotProduct {
dist.to_bits() dist.to_bits()
} }
} }
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct Euclidean;
impl Metric<Vec<f32>> for Euclidean {
type Unit = u32;
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
let dist = squared.sqrt();
debug_assert!(!dist.is_nan());
dist.to_bits()
}
}

View File

@ -8,12 +8,11 @@ use charabia::{Language, Script};
use heed::flags::Flags; use heed::flags::Flags;
use heed::types::*; use heed::types::*;
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
use rand_pcg::Pcg32;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use rstar::RTree; use rstar::RTree;
use time::OffsetDateTime; use time::OffsetDateTime;
use crate::dot_product::DotProduct; use crate::distance::Euclidean;
use crate::error::{InternalError, UserError}; use crate::error::{InternalError, UserError};
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap; use crate::fields_ids_map::FieldsIdsMap;
@ -28,8 +27,8 @@ use crate::{
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
}; };
/// The HNSW data-structure that we serialize, fill and search in. /// The HGG data-structure that we serialize, fill and search in.
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>; pub type Hgg = hgg::Hgg<Euclidean, Vec<f32>, DocumentId>;
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9; pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
@ -47,7 +46,7 @@ pub mod main_key {
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const GEO_RTREE_KEY: &str = "geo-rtree";
pub const VECTOR_HNSW_KEY: &str = "vector-hnsw"; pub const VECTOR_HGG_KEY: &str = "vector-hgg";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const PRIMARY_KEY_KEY: &str = "primary-key";
@ -92,7 +91,6 @@ pub mod db_name {
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
pub const DOCUMENTS: &str = "documents"; pub const DOCUMENTS: &str = "documents";
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids"; pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
} }
@ -156,9 +154,6 @@ pub struct Index {
/// Maps the document id, the facet field id and the strings. /// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>, pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps a vector id to the document id that have it.
pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
/// Maps the document id to the document as an obkv store. /// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>, pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
} }
@ -172,7 +167,7 @@ impl Index {
) -> Result<Index> { ) -> Result<Index> {
use db_name::*; use db_name::*;
options.max_dbs(24); options.max_dbs(23);
unsafe { options.flag(Flags::MdbAlwaysFreePages) }; unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?; let env = options.open(path)?;
@ -212,7 +207,6 @@ impl Index {
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?; env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings = let field_id_docid_facet_strings =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?; env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?; wtxn.commit()?;
@ -241,7 +235,6 @@ impl Index {
facet_id_is_empty_docids, facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
vector_id_docid,
documents, documents,
}) })
} }
@ -513,22 +506,22 @@ impl Index {
} }
} }
/* vector HNSW */ /* vector HGG */
/// Writes the provided `hnsw`. /// Writes the provided `hgg`.
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> { pub(crate) fn put_vector_hgg(&self, wtxn: &mut RwTxn, hgg: &Hgg) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw) self.main.put::<_, Str, SerdeBincode<Hgg>>(wtxn, main_key::VECTOR_HGG_KEY, hgg)
} }
/// Delete the `hnsw`. /// Delete the `hgg`.
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { pub(crate) fn delete_vector_hgg(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY) self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HGG_KEY)
} }
/// Returns the `hnsw`. /// Returns the `hgg`.
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> { pub fn vector_hgg(&self, rtxn: &RoTxn) -> Result<Option<Hgg>> {
match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? { match self.main.get::<_, Str, SerdeBincode<Hgg>>(rtxn, main_key::VECTOR_HGG_KEY)? {
Some(hnsw) => Ok(Some(hnsw)), Some(hgg) => Ok(Some(hgg)),
None => Ok(None), None => Ok(None),
} }
} }

View File

@ -10,7 +10,7 @@ pub mod documents;
mod asc_desc; mod asc_desc;
mod criterion; mod criterion;
pub mod dot_product; mod distance;
mod error; mod error;
mod external_documents_ids; mod external_documents_ids;
pub mod facet; pub mod facet;

View File

@ -28,7 +28,6 @@ use db_cache::DatabaseCache;
use exact_attribute::ExactAttribute; use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo}; use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn; use heed::RoTxn;
use hnsw::Searcher;
use interner::{DedupInterner, Interner}; use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger; pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger}; pub use logger::{DefaultSearchLogger, SearchLogger};
@ -40,7 +39,7 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use sort::Sort; use sort::Sort;
use space::Neighbor; use space::{KnnMap, Neighbor};
use self::geo_sort::GeoSort; use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy; pub use self::geo_sort::Strategy as GeoSortStrategy;
@ -48,9 +47,7 @@ use self::graph_based_ranking_rule::Words;
use self::interner::Interned; use self::interner::Interned;
use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule; use crate::search::new::distinct::apply_distinct_rule;
use crate::{ use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
};
/// A structure used throughout the execution of a search query. /// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> { pub struct SearchContext<'ctx> {
@ -450,26 +447,15 @@ pub fn execute_search(
let docids = match vector { let docids = match vector {
Some(vector) => { Some(vector) => {
// return the nearest documents that are also part of the candidates. // return the nearest documents that are also part of the candidates.
let mut searcher = Searcher::new(); let hgg = ctx.index.vector_hgg(ctx.txn)?.unwrap_or_default();
let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default(); hgg.knn_values(&vector, 100)
let ef = hnsw.len().min(100); .into_iter()
let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef]; .filter(|(Neighbor { distance, .. }, docid)| {
let neighbors = hnsw.nearest(&vector, ef, &mut searcher, &mut dest[..]); dbg!(distance, f32::from_bits(*distance));
universe.contains(**docid)
let mut docids = Vec::new(); })
for Neighbor { index, distance } in neighbors.iter() { .map(|(_, docid)| *docid)
let index = BEU32::new(*index as u32); .collect()
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
dbg!(distance, f32::from_bits(*distance));
if universe.contains(docid) {
docids.push(docid);
if docids.len() == length {
break;
}
}
}
docids
} }
// return the search docids if the vector field is not specified // return the search docids if the vector field is not specified
None => docids, None => docids,

View File

@ -39,7 +39,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_is_empty_docids, facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
vector_id_docid,
documents, documents,
} = self.index; } = self.index;
@ -58,7 +57,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?; self.index.delete_vector_hgg(self.wtxn)?;
// We clean all the faceted documents ids. // We clean all the faceted documents ids.
for field_id in faceted_fields { for field_id in faceted_fields {
@ -97,7 +96,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_string_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?;
vector_id_docid.clear(self.wtxn)?;
documents.clear(self.wtxn)?; documents.clear(self.wtxn)?;
Ok(number_of_documents) Ok(number_of_documents)

View File

@ -240,7 +240,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
facet_id_is_empty_docids, facet_id_is_empty_docids,
vector_id_docid,
documents, documents,
} = self.index; } = self.index;
// Remove from the documents database // Remove from the documents database
@ -275,6 +274,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&mut words_to_delete, &mut words_to_delete,
)?; )?;
todo!("delete the documents from the Hgg datastructure");
// We construct an FST set that contains the words to delete from the words FST. // We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?; let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;

View File

@ -9,8 +9,8 @@ use charabia::{Language, Script};
use grenad::MergerBuilder; use grenad::MergerBuilder;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::RwTxn; use heed::RwTxn;
use hnsw::Searcher;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use space::KnnInsert;
use super::helpers::{ use super::helpers::{
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
@ -19,7 +19,7 @@ use super::{ClonableMmap, MergeFn};
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::update::facet::FacetsUpdate; use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
pub(crate) enum TypedChunk { pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>), FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@ -225,19 +225,16 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
} }
TypedChunk::VectorPoints(vector_points) => { TypedChunk::VectorPoints(vector_points) => {
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default(); let mut hgg = index.vector_hgg(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
let mut cursor = vector_points.into_cursor()?; let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes) // convert the key back to a u32 (4 bytes)
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// convert the vector back to a Vec<f32> // convert the vector back to a Vec<f32> and insert it.
let vector: Vec<f32> = pod_collect_to_vec(value); // TODO enable again when the library is fixed
let vector_id = hnsw.insert(vector, &mut searcher) as u32; hgg.insert(pod_collect_to_vec(value), docid);
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
} }
index.put_vector_hnsw(wtxn, &hnsw)?; index.put_vector_hgg(wtxn, &hgg)?;
} }
TypedChunk::ScriptLanguageDocids(hash_pair) => { TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new(); let mut buffer = Vec::new();