mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-12-02 18:15:38 +08:00
Index the word pair proximities for both orders of the pair
This commit is contained in:
parent
a58ae5eb2a
commit
31224a8425
@ -147,8 +147,6 @@ fn compute_words_pair_proximities(
|
|||||||
if prox > 0 && prox < 8 { distances.insert(prox); }
|
if prox > 0 && prox < 8 { distances.insert(prox); }
|
||||||
}
|
}
|
||||||
if !distances.is_empty() {
|
if !distances.is_empty() {
|
||||||
// We only store the proximities under one word pair.
|
|
||||||
let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) };
|
|
||||||
words_pair_proximities.entry((w1.as_str(), w2.as_str()))
|
words_pair_proximities.entry((w1.as_str(), w2.as_str()))
|
||||||
.or_insert_with(RoaringBitmap::new)
|
.or_insert_with(RoaringBitmap::new)
|
||||||
.union_with(&distances);
|
.union_with(&distances);
|
||||||
@ -256,7 +254,6 @@ impl Store {
|
|||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
|
||||||
for ((w1, w2), proximities) in words_pair_proximities {
|
for ((w1, w2), proximities) in words_pair_proximities {
|
||||||
assert!(w1 <= w2);
|
|
||||||
key.truncate(1);
|
key.truncate(1);
|
||||||
key.extend_from_slice(w1.as_bytes());
|
key.extend_from_slice(w1.as_bytes());
|
||||||
key.push(0);
|
key.push(0);
|
||||||
|
@ -74,7 +74,7 @@ enum Command {
|
|||||||
/// Outputs a CSV with the proximities for the two specified words and
|
/// Outputs a CSV with the proximities for the two specified words and
|
||||||
/// the documents ids where these relations appear.
|
/// the documents ids where these relations appear.
|
||||||
///
|
///
|
||||||
/// `word1`, `word2` defines the word pair specified and sorted.
|
/// `word1`, `word2` defines the word pair specified *in this specific order*.
|
||||||
/// `proximity` defines the proximity between the two specified words.
|
/// `proximity` defines the proximity between the two specified words.
|
||||||
/// `documents_ids` defines the documents ids where the relation appears.
|
/// `documents_ids` defines the documents ids where the relation appears.
|
||||||
WordPairProximitiesDocids {
|
WordPairProximitiesDocids {
|
||||||
@ -339,17 +339,15 @@ fn word_pair_proximities_docids(
|
|||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use milli::RoaringBitmapCodec;
|
use milli::RoaringBitmapCodec;
|
||||||
|
|
||||||
let (w1, w2) = if word1 > word2 { (word2, word1) } else { (word1, word2) };
|
|
||||||
|
|
||||||
let stdout = io::stdout();
|
let stdout = io::stdout();
|
||||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||||
wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?;
|
wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?;
|
||||||
|
|
||||||
// Create the prefix key with only the pair of words.
|
// Create the prefix key with only the pair of words.
|
||||||
let mut prefix = Vec::with_capacity(w1.len() + w2.len() + 1);
|
let mut prefix = Vec::with_capacity(word1.len() + word2.len() + 1);
|
||||||
prefix.extend_from_slice(w1.as_bytes());
|
prefix.extend_from_slice(word1.as_bytes());
|
||||||
prefix.push(0);
|
prefix.push(0);
|
||||||
prefix.extend_from_slice(w2.as_bytes());
|
prefix.extend_from_slice(word2.as_bytes());
|
||||||
|
|
||||||
let db = index.word_pair_proximity_docids.as_polymorph();
|
let db = index.word_pair_proximity_docids.as_polymorph();
|
||||||
let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
|
let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
|
||||||
@ -366,7 +364,7 @@ fn word_pair_proximities_docids(
|
|||||||
} else {
|
} else {
|
||||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||||
};
|
};
|
||||||
wtr.write_record(&[&w1, &w2, &proximity.to_string(), &docids])?;
|
wtr.write_record(&[&word1, &word2, &proximity.to_string(), &docids])?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(wtr.flush()?)
|
Ok(wtr.flush()?)
|
||||||
|
Loading…
Reference in New Issue
Block a user