diff --git a/Cargo.lock b/Cargo.lock
index 1fa4b92e8..e5f05b909 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -93,6 +93,12 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "cow-utils"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
+
 [[package]]
 name = "crc32c"
 version = "0.4.0"
@@ -391,6 +397,7 @@ dependencies = [
  "anyhow",
  "bitpacking",
  "byteorder 1.3.4",
+ "cow-utils",
  "csv",
  "fst",
  "fxhash",
diff --git a/Cargo.toml b/Cargo.toml
index abe475a1d..45e71778f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,11 +3,13 @@ name = "mega-mini-indexer"
 version = "0.1.0"
 authors = ["Kerollmops "]
 edition = "2018"
+default-run = "indexer"
 
 [dependencies]
 anyhow = "1.0.28"
 bitpacking = "0.8.2"
 byteorder = "1.3.4"
+cow-utils = "0.1.2"
 csv = "1.1.3"
 fst = "0.4.3"
 fxhash = "0.2.1"
diff --git a/src/main.rs b/src/bin/indexer.rs
similarity index 93%
rename from src/main.rs
rename to src/bin/indexer.rs
index 7a22d16d7..a7d2c01f1 100644
--- a/src/main.rs
+++ b/src/bin/indexer.rs
@@ -1,27 +1,21 @@
-use std::collections::{HashMap, BTreeSet};
-use std::convert::TryFrom;
-use std::convert::TryInto;
+use std::collections::BTreeSet;
+use std::convert::{TryInto, TryFrom};
 use std::fs::File;
-use std::hash::BuildHasherDefault;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
 use anyhow::Context;
+use cow_utils::CowUtils;
 use fst::{Streamer, IntoStreamer};
-use fxhash::FxHasher32;
 use heed::types::*;
 use heed::{EnvOpenOptions, PolyDatabase, Database};
 use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions};
 use rayon::prelude::*;
 use roaring::RoaringBitmap;
-use slice_group_by::StrGroupBy;
 use structopt::StructOpt;
 
-pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
-pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
-pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
-pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BigEndian>;
-pub type DocumentId = u32;
+use mega_mini_indexer::alphanumeric_tokens;
+use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, DocumentId};
 
 #[cfg(target_os = "linux")]
 #[global_allocator]
@@ -41,11 +35,6 @@ struct Opt {
     files_to_index: Vec<PathBuf>,
 }
 
-fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
-    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
-    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
-}
-
 struct Indexed {
     fst: fst::Set<Vec<u8>>,
     postings_ids: FastMap4<SmallVec32, RoaringBitmap>,
@@ -181,6 +170,7 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<Indexed> {
         for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
             for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
                 if !word.is_empty() && word.len() < 500 { // LMDB limits
+                    let word = word.cow_to_lowercase();
                     postings_ids.entry(SmallVec32::from(word.as_bytes()))
                         .or_insert_with(RoaringBitmap::new)
                         .insert(document_id);
diff --git a/src/bin/search.rs b/src/bin/search.rs
new file mode 100644
index 000000000..aea12610c
--- /dev/null
+++ b/src/bin/search.rs
@@ -0,0 +1,85 @@
+use std::io::{self, Write};
+use std::path::PathBuf;
+use std::time::Instant;
+
+use cow_utils::CowUtils;
+use heed::types::*;
+use heed::{EnvOpenOptions, Database};
+use roaring::RoaringBitmap;
+use structopt::StructOpt;
+
+use mega_mini_indexer::alphanumeric_tokens;
+use mega_mini_indexer::BEU32;
+
+#[derive(Debug, StructOpt)]
+#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
+struct Opt {
+    /// The database path where the database is located.
+    /// It is created if it doesn't already exist.
+    #[structopt(long = "db", parse(from_os_str))]
+    database: PathBuf,
+
+    /// The query string to search for (doesn't support prefix search yet).
+    query: String,
+}
+
+fn main() -> anyhow::Result<()> {
+    let opt = Opt::from_args();
+
+    std::fs::create_dir_all(&opt.database)?;
+    let env = EnvOpenOptions::new()
+        .map_size(100 * 1024 * 1024 * 1024) // 100 GB
+        .max_readers(10)
+        .max_dbs(5)
+        .open(opt.database)?;
+
+    let main = env.create_poly_database(None)?;
+    let postings_ids: Database<Str, ByteSlice> = env.create_database(Some("postings-ids"))?;
+    let documents: Database<OwnedType<BEU32>, ByteSlice> = env.create_database(Some("documents"))?;
+
+    let rtxn = env.read_txn()?;
+
+    let before = Instant::now();
+    let mut result: Option<RoaringBitmap> = None;
+    for word in alphanumeric_tokens(&opt.query) {
+        let word = word.cow_to_lowercase();
+        match postings_ids.get(&rtxn, &word)? {
+            Some(ids) => {
+                let before = Instant::now();
+                let right = RoaringBitmap::deserialize_from(ids)?;
+                eprintln!("deserialized bitmap for {:?} took {:.02?}", word, before.elapsed());
+                result = match result.take() {
+                    Some(mut left) => {
+                        let before = Instant::now();
+                        let left_len = left.len();
+                        left.intersect_with(&right);
+                        eprintln!("intersect between {:?} and {:?} took {:.02?}",
+                            left_len, right.len(), before.elapsed());
+                        Some(left)
+                    },
+                    None => Some(right),
+                };
+            },
+            None => result = Some(RoaringBitmap::default()),
+        }
+    }
+
+    let headers = match main.get::<_, Str, ByteSlice>(&rtxn, "headers")? {
+        Some(headers) => headers,
+        None => return Ok(()),
+    };
+
+    let mut stdout = io::stdout();
+    stdout.write_all(&headers)?;
+
+    let total_length = result.as_ref().map_or(0, |x| x.len());
+    for id in result.unwrap_or_default().iter().take(20) {
+        if let Some(content) = documents.get(&rtxn, &BEU32::new(id))? {
+            stdout.write_all(&content)?;
+        }
+    }
+
+    eprintln!("Took {:.02?} to find {} documents", before.elapsed(), total_length);
+
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 000000000..9e5baf1fd
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,16 @@
+use std::collections::HashMap;
+use std::hash::BuildHasherDefault;
+
+use fxhash::FxHasher32;
+use slice_group_by::StrGroupBy;
+
+pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
+pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
+pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
+pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BigEndian>;
+pub type DocumentId = u32;
+
+pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
+    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
+    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
+}