Far better usage of rayon to fold indexed data

2024-11-26 12:05:05 +08:00 · 2020-05-30 19:56:57 +02:00 · 2020-05-30 19:56:57 +02:00 · 3a998cf39c
commit 3a998cf39c
parent 1237306ca8
3 changed files with 107 additions and 113 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -84,16 +84,6 @@ dependencies = [
 "unicode-width",
 ]
 [[package]]
 name = "crossbeam-channel"
 version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cced8691919c02aac3cb0a1bc2e9b73d89e832bf9a06fc579d4e71b68a2da061"
 dependencies = [
 "crossbeam-utils",
 "maybe-uninit",
 ]
 [[package]]
 name = "crossbeam-deque"
 version = "0.7.3"
@@ -330,7 +320,6 @@ dependencies = [
 "anyhow",
 "bitpacking",
 "byteorder 1.3.4",
 "crossbeam-channel",
 "csv",
 "fst",
 "fxhash",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,6 @@ anyhow = "1.0.28"
 bitpacking = "0.8.2"
 byteorder = "1.3.4"
 roaring = "0.5.2"
 crossbeam-channel = "0.4.2"
 csv = "1.1.3"
 fst = "0.4.3"
 fxhash = "0.2.1"
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,17 +1,16 @@
 use std::collections::hash_map::Entry;
 use std::collections::{HashMap, BTreeSet};
 use std::convert::TryFrom;
 use std::fs::File;
 use std::hash::BuildHasherDefault;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::thread;
 use anyhow::{ensure, Context};
 use roaring::RoaringBitmap;
 use crossbeam_channel::{select, Sender, Receiver};
 use fst::IntoStreamer;
 use fxhash::FxHasher32;
-use heed::{EnvOpenOptions, Database};
+use heed::{EnvOpenOptions, PolyDatabase, Database};
 use heed::types::*;
 use rayon::prelude::*;
 use slice_group_by::StrGroupBy;
@@ -81,97 +80,67 @@ fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
 }
-enum MainKey {
+#[derive(Default)]
-    WordsFst(fst::Set<Vec<u8>>),
+struct Indexed {
-    Headers(Vec<u8>),
+    fst: fst::Set<Vec<u8>>,
    postings_ids: FastMap4<SmallString32, RoaringBitmap>,
    headers: Vec<u8>,
    documents: Vec<(DocumentId, Vec<u8>)>,
 }
-#[derive(Clone)]
+impl Indexed {
-struct DbSender {
+    fn merge_with(mut self, mut other: Indexed) -> Indexed {
    main: Sender<MainKey>,
    postings_ids: Sender<(SmallString32, RoaringBitmap)>,
    documents: Sender<(DocumentId, Vec<u8>)>,
 }
-struct DbReceiver {
+        // Union of the two FSTs
-    main: Receiver<MainKey>,
+        let op = fst::set::OpBuilder::new()
-    postings_ids: Receiver<(SmallString32, RoaringBitmap)>,
+            .add(self.fst.into_stream())
-    documents: Receiver<(DocumentId, Vec<u8>)>,
+            .add(other.fst.into_stream())
-}
+            .r#union();
-fn thread_channel() -> (DbSender, DbReceiver) {
+        let mut build = fst::SetBuilder::memory();
-    let (sd_main, rc_main) = crossbeam_channel::bounded(4);
+        build.extend_stream(op.into_stream()).unwrap();
-    let (sd_postings, rc_postings) = crossbeam_channel::bounded(10);
+        let fst = build.into_set();
    let (sd_documents, rc_documents) = crossbeam_channel::bounded(10);
-    let sender = DbSender { main: sd_main, postings_ids: sd_postings, documents: sd_documents };
+        // Merge the postings by unions
-    let receiver = DbReceiver { main: rc_main, postings_ids: rc_postings, documents: rc_documents };
+        for (word, mut postings) in other.postings_ids {
-
+            match self.postings_ids.entry(word) {
-    (sender, receiver)
+                Entry::Occupied(mut entry) => {
-}
+                    let old = entry.get();
-
+                    postings.union_with(&old);
-fn writer_thread(env: heed::Env, receiver: DbReceiver) -> anyhow::Result<()> {
+                    entry.insert(postings);
    let main = env.create_poly_database(None)?;
    let postings_ids: Database<Str, ByteSlice> = env.create_database(Some("postings-ids"))?;
    let documents: Database<OwnedType<BEU32>, ByteSlice> = env.create_database(Some("documents"))?;
    let mut wtxn = env.write_txn()?;
    loop {
        select! {
            recv(receiver.main) -> msg => {
                let msg = match msg {
                    Err(_) => break,
                    Ok(msg) => msg,
                };
                match msg {
                    MainKey::WordsFst(new_fst) => {
                        let old_value = main.get::<_, Str, ByteSlice>(&wtxn, "words-fst")?;
                        let new_value = union_words_fst(b"words-fst", old_value, &new_fst)
                            .context("error while do a words-fst union")?;
                        main.put::<_, Str, ByteSlice>(&mut wtxn, "words-fst", &new_value)?;
                },
-                    MainKey::Headers(headers) => {
+                Entry::Vacant(entry) => {
-                        if let Some(old_headers) = main.get::<_, Str, ByteSlice>(&wtxn, "headers")? {
+                    entry.insert(postings);
                            ensure!(old_headers == &*headers, "headers differs from the previous ones");
                        }
                        main.put::<_, Str, ByteSlice>(&mut wtxn, "headers", &headers)?;
                    },
                }
            },
            recv(receiver.postings_ids) -> msg => {
                let (word, postings) = match msg {
                    Err(_) => break,
                    Ok(msg) => msg,
                };
                let old_value = postings_ids.get(&wtxn, &word)?;
                let new_value = union_postings_ids(word.as_bytes(), old_value, postings)
                    .context("error while do a words-fst union")?;
                postings_ids.put(&mut wtxn, &word, &new_value)?;
            },
            recv(receiver.documents) -> msg => {
                let (id, content) = match msg {
                    Err(_) => break,
                    Ok(msg) => msg,
                };
                documents.put(&mut wtxn, &BEU32::new(id), &content)?;
                },
            }
        }
-    wtxn.commit()?;
+        // assert headers are valid
-    Ok(())
+        assert_eq!(self.headers, other.headers);
        // extend the documents
        self.documents.append(&mut other.documents);
        Indexed {
            fst,
            postings_ids: self.postings_ids,
            headers: self.headers,
            documents: self.documents,
        }
    }
 }
-fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader<File>) -> anyhow::Result<usize> {
+fn index_csv(
    tid: usize,
    mut rdr: csv::Reader<File>,
 ) -> anyhow::Result<Indexed>
 {
    const MAX_POSITION: usize = 1000;
    const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
    let mut document = csv::StringRecord::new();
-    let mut new_postings_ids = FastMap4::default();
+    let mut postings_ids = FastMap4::default();
-    let mut new_words = BTreeSet::default();
+    let mut documents = Vec::new();
    let mut number_of_documents = 0;
    // Write the headers into a Vec of bytes.
@@ -179,7 +148,6 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader<File>) -> any
    let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
    writer.write_byte_record(headers.as_byte_record())?;
    let headers = writer.into_inner()?;
    db_sender.main.send(MainKey::Headers(headers))?;
    while rdr.read_record(&mut document)? {
        let document_id = ID_GENERATOR.fetch_add(1, Ordering::SeqCst);
@@ -188,7 +156,7 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader<File>) -> any
        for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
            for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
                if !word.is_empty() && word.len() < 500 { // LMDB limits
-                    new_postings_ids.entry(SmallString32::from(word))
+                    postings_ids.entry(SmallString32::from(word))
                        .or_insert_with(RoaringBitmap::new)
                        .insert(document_id);
                }
@@ -199,7 +167,7 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader<File>) -> any
        let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
        writer.write_byte_record(document.as_byte_record())?;
        let document = writer.into_inner()?;
-        db_sender.documents.send((document_id, document))?;
+        documents.push((document_id, document));
        number_of_documents += 1;
        if number_of_documents % 100000 == 0 {
@@ -207,26 +175,57 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader<File>) -> any
        }
    }
-    eprintln!("Start collecting the postings lists and words");
+    eprintln!("Start collecting the words into an FST");
    // We compute and store the postings list into the DB.
-    for (word, new_ids) in new_postings_ids {
+    let mut new_words = BTreeSet::default();
-        db_sender.postings_ids.send((word.clone(), new_ids))?;
+    for (word, _new_ids) in &postings_ids {
-        new_words.insert(word);
+        new_words.insert(word.clone());
    }
-    eprintln!("Finished collecting the postings lists and words");
+    let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?;
-    eprintln!("Start merging the words-fst");
+    eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed));
-    let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?;
+    Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents })
-    drop(new_words);
+}
    db_sender.main.send(MainKey::WordsFst(new_words_fst))?;
-    eprintln!("Finished merging the words-fst");
+fn writer(
-    eprintln!("Total number of documents seen is {}", ID_GENERATOR.load(Ordering::Relaxed));
+    wtxn: &mut heed::RwTxn,
    main: PolyDatabase,
    postings_ids: Database<Str, ByteSlice>,
    documents: Database<OwnedType<BEU32>, ByteSlice>,
    indexed: Indexed,
 ) -> anyhow::Result<usize>
 {
    // Write and merge the words fst
    let old_value = main.get::<_, Str, ByteSlice>(wtxn, "words-fst")?;
    let new_value = union_words_fst(b"words-fst", old_value, &indexed.fst)
        .context("error while do a words-fst union")?;
    main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &new_value)?;
-    Ok(number_of_documents)
+    // Write and merge the headers
    if let Some(old_headers) = main.get::<_, Str, ByteSlice>(wtxn, "headers")? {
        ensure!(old_headers == &*indexed.headers, "headers differs from the previous ones");
    }
    main.put::<_, Str, ByteSlice>(wtxn, "headers", &indexed.headers)?;
    // Write and merge the postings lists
    for (word, postings) in indexed.postings_ids {
        let old_value = postings_ids.get(wtxn, word.as_str())?;
        let new_value = union_postings_ids(word.as_bytes(), old_value, postings)
            .context("error while do a words-fst union")?;
        postings_ids.put(wtxn, &word, &new_value)?;
    }
    let count = indexed.documents.len();
    // Write the documents
    for (id, content) in indexed.documents {
        documents.put(wtxn, &BEU32::new(id), &content)?;
    }
    Ok(count)
 }
 fn main() -> anyhow::Result<()> {
@@ -239,22 +238,29 @@ fn main() -> anyhow::Result<()> {
        .max_dbs(5)
        .open(opt.database)?;
-    let (sender, receiver) = thread_channel();
+    let main = env.create_poly_database(None)?;
-    let writing_child = thread::spawn(move || writer_thread(env, receiver));
+    let postings_ids: Database<Str, ByteSlice> = env.create_database(Some("postings-ids"))?;
    let documents: Database<OwnedType<BEU32>, ByteSlice> = env.create_database(Some("documents"))?;
    let res = opt.files_to_index
        .into_par_iter()
        .enumerate()
-        .map(|(tid, path)| {
+        .try_fold(|| Indexed::default(), |acc, (tid, path)| {
            let rdr = csv::Reader::from_path(path)?;
-            index_csv(tid, sender.clone(), rdr)
+            let indexed = index_csv(tid, rdr)?;
            Ok(acc.merge_with(indexed)) as anyhow::Result<Indexed>
        })
        .map(|indexed| match indexed {
            Ok(indexed) => {
                let mut wtxn = env.write_txn()?;
                let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?;
                wtxn.commit()?;
                Ok(count)
            },
            Err(e) => Err(e),
        })
        .try_reduce(|| 0, |a, b| Ok(a + b));
    eprintln!("witing the writing thread...");
    writing_child.join().unwrap().unwrap();
    println!("indexed {:?} documents", res);
    Ok(())