Implement test Context methods

This commit is contained in:
many 2021-03-18 13:49:55 +01:00
parent 4ff67ec2ee
commit 75e7b1e3da
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA

View File

@ -366,6 +366,7 @@ pub mod test {
word_prefix_docids: HashMap<String, RoaringBitmap>, word_prefix_docids: HashMap<String, RoaringBitmap>,
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
docid_words: HashMap<u32, Vec<String>>,
} }
impl<'a> Context for TestContext<'a> { impl<'a> Context for TestContext<'a> {
@ -399,8 +400,17 @@ pub mod test {
self.word_prefix_docids.contains_key(&word.to_string()) self.word_prefix_docids.contains_key(&word.to_string())
} }
fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
todo!() if let Some(docid_words) = self.docid_words.get(&docid) {
Ok(docid_words
.iter()
.enumerate()
.map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))))
.collect()
)
} else {
Ok(HashMap::new())
}
} }
} }
@ -435,50 +445,58 @@ pub mod test {
s("morning") => random_postings(rng, 125), s("morning") => random_postings(rng, 125),
}; };
let mut docid_words = HashMap::new();
for (word, docids) in word_docids.iter() {
for docid in docids {
let words = docid_words.entry(docid).or_insert(vec![]);
words.push(word.clone());
}
}
let word_prefix_docids = hashmap!{ let word_prefix_docids = hashmap!{
s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")],
s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")],
s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
}; };
let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; let mut word_pair_proximity_docids = HashMap::new();
let hello_world_split = (hello_world.len() / 2) as usize; let mut word_prefix_pair_proximity_docids = HashMap::new();
let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); for (lword, lcandidates) in &word_docids {
let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); for (rword, rcandidates) in &word_docids {
if lword == rword { continue }
let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; let candidates = lcandidates & rcandidates;
let hello_word_split = (hello_word.len() / 2) as usize; for candidate in candidates {
let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); if let Some(docid_words) = docid_words.get(&candidate) {
let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); let rposition = docid_words.iter().position(|w| w == rword).unwrap();
let word_pair_proximity_docids = hashmap!{ let key = if lposition < rposition {
(s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], (s(lword), s(rword), (rposition - lposition) as i32)
(s("hello"), s("world"), 1) => hello_world_1, } else {
(s("hello"), s("world"), 4) => hello_world_2, (s(lword), s(rword), (lposition - rposition + 1) as i32)
(s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")],
(s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
(s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
(s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
(s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
(s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")],
(s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")],
(s("hello"), s("word"), 4) => hello_word_4,
(s("hello"), s("word"), 6) => hello_word_6,
(s("hello"), s("word"), 7) => hello_word_7,
(s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")],
(s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
(s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")],
(s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
}; };
let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
let word_prefix_pair_proximity_docids = hashmap!{ docids.push(candidate);
(s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), }
(s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), }
(s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), }
(s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), for (pword, pcandidates) in &word_prefix_docids {
(s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), if lword.starts_with(pword) { continue }
(s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), let candidates = lcandidates & pcandidates;
for candidate in candidates {
if let Some(docid_words) = docid_words.get(&candidate) {
let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
let key = if lposition < rposition {
(s(lword), s(pword), (rposition - lposition) as i32)
} else {
(s(lword), s(pword), (lposition - rposition + 1) as i32)
}; };
let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new());
docids.push(candidate);
}
}
}
}
let mut keys = word_docids.keys().collect::<Vec<_>>(); let mut keys = word_docids.keys().collect::<Vec<_>>();
keys.sort_unstable(); keys.sort_unstable();
@ -490,6 +508,7 @@ pub mod test {
word_prefix_docids, word_prefix_docids,
word_pair_proximity_docids, word_pair_proximity_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
docid_words,
} }
} }
} }