mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Index the intra documents word pair proximities
This commit is contained in:
parent
7b67ae6972
commit
d6fa9c0414
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -968,6 +968,7 @@ dependencies = [
|
|||||||
"fst",
|
"fst",
|
||||||
"fxhash",
|
"fxhash",
|
||||||
"heed",
|
"heed",
|
||||||
|
"itertools",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
"log 0.4.11",
|
"log 0.4.11",
|
||||||
|
@ -30,6 +30,9 @@ smallvec = "1.4.0"
|
|||||||
structopt = { version = "0.3.14", default-features = false }
|
structopt = { version = "0.3.14", default-features = false }
|
||||||
tempfile = "3.1.0"
|
tempfile = "3.1.0"
|
||||||
|
|
||||||
|
# documents words self-join
|
||||||
|
itertools = "0.9.0"
|
||||||
|
|
||||||
# logging
|
# logging
|
||||||
log = "0.4.11"
|
log = "0.4.11"
|
||||||
stderrlog = "0.4.3"
|
stderrlog = "0.4.3"
|
||||||
|
@ -37,6 +37,7 @@ const WORDS_FST_KEY: &[u8] = b"\x06words-fst";
|
|||||||
const HEADERS_BYTE: u8 = 0;
|
const HEADERS_BYTE: u8 = 0;
|
||||||
const WORD_DOCID_POSITIONS_BYTE: u8 = 1;
|
const WORD_DOCID_POSITIONS_BYTE: u8 = 1;
|
||||||
const WORD_DOCIDS_BYTE: u8 = 2;
|
const WORD_DOCIDS_BYTE: u8 = 2;
|
||||||
|
const WORDS_PROXIMITIES_BYTE: u8 = 5;
|
||||||
const DOCUMENTS_IDS_BYTE: u8 = 4;
|
const DOCUMENTS_IDS_BYTE: u8 = 4;
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
@ -128,6 +129,35 @@ fn create_writer(type_: CompressionType, level: Option<u32>, file: File) -> Writ
|
|||||||
builder.build(file)
|
builder.build(file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn compute_words_pair_proximities(
|
||||||
|
word_positions: &HashMap<String, RoaringBitmap>,
|
||||||
|
) -> HashMap<(&str, &str), RoaringBitmap>
|
||||||
|
{
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
|
let mut words_pair_proximities = HashMap::new();
|
||||||
|
for (w1, w2) in word_positions.keys().cartesian_product(word_positions.keys()) {
|
||||||
|
let mut distances = RoaringBitmap::new();
|
||||||
|
let positions1: Vec<_> = word_positions[w1].iter().collect();
|
||||||
|
let positions2: Vec<_> = word_positions[w2].iter().collect();
|
||||||
|
for (ps1, ps2) in positions1.iter().cartesian_product(positions2.iter()) {
|
||||||
|
let prox = milli::proximity::positions_proximity(*ps1, *ps2);
|
||||||
|
// We don't care about a word that appear at the
|
||||||
|
// same position or too far from the other.
|
||||||
|
if prox > 0 && prox < 8 { distances.insert(prox); }
|
||||||
|
}
|
||||||
|
if !distances.is_empty() {
|
||||||
|
// We only store the proximites under one word pair.
|
||||||
|
let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) };
|
||||||
|
words_pair_proximities.entry((w1.as_str(), w2.as_str()))
|
||||||
|
.or_insert_with(RoaringBitmap::new)
|
||||||
|
.union_with(&distances);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
words_pair_proximities
|
||||||
|
}
|
||||||
|
|
||||||
type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
|
type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
|
||||||
|
|
||||||
struct Store {
|
struct Store {
|
||||||
@ -213,6 +243,43 @@ impl Store {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME We must store those pairs in an ArcCache to reduce the number of I/O operations,
|
||||||
|
// We must store the documents ids associated with the words pairs and proximities.
|
||||||
|
fn write_words_proximities(
|
||||||
|
sorter: &mut Sorter<MergeFn>,
|
||||||
|
document_id: DocumentId,
|
||||||
|
words_pair_proximities: &HashMap<(&str, &str), RoaringBitmap>,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
{
|
||||||
|
// words proximities keys are all prefixed
|
||||||
|
let mut key = vec![WORDS_PROXIMITIES_BYTE];
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
|
||||||
|
for ((w1, w2), proximities) in words_pair_proximities {
|
||||||
|
assert!(w1 <= w2);
|
||||||
|
key.truncate(1);
|
||||||
|
key.extend_from_slice(w1.as_bytes());
|
||||||
|
key.push(0);
|
||||||
|
key.extend_from_slice(w2.as_bytes());
|
||||||
|
let pair_len = key.len();
|
||||||
|
for prox in proximities {
|
||||||
|
key.truncate(pair_len);
|
||||||
|
key.push(u8::try_from(prox).unwrap());
|
||||||
|
// We serialize the document ids into a buffer
|
||||||
|
buffer.clear();
|
||||||
|
let ids = RoaringBitmap::from_iter(Some(document_id));
|
||||||
|
buffer.reserve(ids.serialized_size());
|
||||||
|
ids.serialize_into(&mut buffer)?;
|
||||||
|
// that we write under the generated key into MTBL
|
||||||
|
if lmdb_key_valid_size(&key) {
|
||||||
|
sorter.insert(&key, &buffer)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn write_docid_word_positions(
|
fn write_docid_word_positions(
|
||||||
sorter: &mut Sorter<MergeFn>,
|
sorter: &mut Sorter<MergeFn>,
|
||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
@ -307,6 +374,9 @@ impl Store {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let words_pair_proximities = compute_words_pair_proximities(&word_positions);
|
||||||
|
Self::write_words_proximities(&mut self.sorter, document_id, &words_pair_proximities)?;
|
||||||
|
|
||||||
// We write the document in the documents store.
|
// We write the document in the documents store.
|
||||||
self.write_document(document_id, &word_positions, &document)?;
|
self.write_document(document_id, &word_positions, &document)?;
|
||||||
word_positions.clear();
|
word_positions.clear();
|
||||||
@ -386,7 +456,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
|||||||
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
||||||
Ok(values[0].to_vec())
|
Ok(values[0].to_vec())
|
||||||
},
|
},
|
||||||
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => {
|
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORDS_PROXIMITIES_BYTE => {
|
||||||
let (head, tail) = values.split_first().unwrap();
|
let (head, tail) = values.split_first().unwrap();
|
||||||
|
|
||||||
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
||||||
@ -428,6 +498,10 @@ fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) ->
|
|||||||
// Write the postings lists
|
// Write the postings lists
|
||||||
index.docid_word_positions.as_polymorph()
|
index.docid_word_positions.as_polymorph()
|
||||||
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
||||||
|
} else if key.starts_with(&[WORDS_PROXIMITIES_BYTE]) {
|
||||||
|
// Write the word pair proximity document ids
|
||||||
|
index.word_pair_proximity_docids.as_polymorph()
|
||||||
|
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -17,8 +17,8 @@ use heed::{PolyDatabase, Database};
|
|||||||
pub use self::search::{Search, SearchResult};
|
pub use self::search::{Search, SearchResult};
|
||||||
pub use self::criterion::{Criterion, default_criteria};
|
pub use self::criterion::{Criterion, default_criteria};
|
||||||
pub use self::heed_codec::{
|
pub use self::heed_codec::{
|
||||||
RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec,
|
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
|
||||||
ByteorderXRoaringBitmapCodec,
|
CsvStringRecordCodec, ByteorderXRoaringBitmapCodec,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
@ -45,6 +45,8 @@ pub struct Index {
|
|||||||
pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>,
|
pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>,
|
||||||
/// Maps the document id to the document as a CSV line.
|
/// Maps the document id to the document as a CSV line.
|
||||||
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
|
/// Maps a pair of words and the proximity between them to all the docids where this relation appears.
|
||||||
|
pub word_pair_proximity_docids: Database<StrStrU8Codec, RoaringBitmapCodec>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Index {
|
impl Index {
|
||||||
@ -54,6 +56,7 @@ impl Index {
|
|||||||
word_docids: env.create_database(Some("word-docids"))?,
|
word_docids: env.create_database(Some("word-docids"))?,
|
||||||
docid_word_positions: env.create_database(Some("docid-word-positions"))?,
|
docid_word_positions: env.create_database(Some("docid-word-positions"))?,
|
||||||
documents: env.create_database(Some("documents"))?,
|
documents: env.create_database(Some("documents"))?,
|
||||||
|
word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::cmp;
|
|
||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use levenshtein_automata::DFA;
|
use levenshtein_automata::DFA;
|
||||||
@ -12,7 +11,7 @@ use near_proximity::near_proximity;
|
|||||||
|
|
||||||
use crate::proximity::path_proximity;
|
use crate::proximity::path_proximity;
|
||||||
use crate::query_tokens::{QueryTokens, QueryToken};
|
use crate::query_tokens::{QueryTokens, QueryToken};
|
||||||
use crate::{Index, DocumentId, Position};
|
use crate::{Index, DocumentId};
|
||||||
|
|
||||||
// Building these factories is not free.
|
// Building these factories is not free.
|
||||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||||
|
Loading…
Reference in New Issue
Block a user