2023-04-13 19:45:34 +08:00
|
|
|
use std::error::Error;
|
2023-03-22 21:50:41 +08:00
|
|
|
use std::io::stdin;
|
2023-04-13 19:45:34 +08:00
|
|
|
use std::path::Path;
|
2023-03-22 21:50:41 +08:00
|
|
|
use std::time::Instant;
|
|
|
|
|
|
|
|
use heed::EnvOpenOptions;
|
|
|
|
use milli::{
|
2023-11-15 22:46:37 +08:00
|
|
|
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext,
|
2024-03-05 18:21:46 +08:00
|
|
|
SearchLogger, TermsMatchingStrategy, TimeBudget,
|
2023-03-22 21:50:41 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
// Register mimalloc as this binary's global allocator, replacing Rust's default one.
#[global_allocator]
|
|
|
|
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
|
|
|
|
|
|
|
fn main() -> Result<(), Box<dyn Error>> {
|
|
|
|
let mut args = std::env::args();
|
2023-03-27 23:49:43 +08:00
|
|
|
let program_name = args.next().expect("No program name");
|
|
|
|
let dataset = args.next().unwrap_or_else(|| {
|
2023-03-28 00:18:01 +08:00
|
|
|
panic!(
|
2023-03-27 23:49:43 +08:00
|
|
|
"Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
|
|
|
|
program_name
|
|
|
|
)
|
|
|
|
});
|
2023-04-11 17:56:31 +08:00
|
|
|
let detailed_logger_dir = args.next();
|
2023-03-27 23:49:43 +08:00
|
|
|
let print_documents: bool =
|
|
|
|
if let Some(arg) = args.next() { arg == "print-documents" } else { false };
|
2023-03-22 21:50:41 +08:00
|
|
|
|
|
|
|
let mut options = EnvOpenOptions::new();
|
|
|
|
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
|
|
|
|
|
|
|
let index = Index::new(options, dataset)?;
|
|
|
|
let txn = index.read_txn()?;
|
|
|
|
let mut query = String::new();
|
|
|
|
while stdin().read_line(&mut query)? > 0 {
|
2023-03-23 16:12:35 +08:00
|
|
|
for _ in 0..2 {
|
2023-03-27 23:49:43 +08:00
|
|
|
let mut default_logger = DefaultSearchLogger;
|
|
|
|
// FIXME: consider resetting the state of the logger between search executions as otherwise panics are possible.
|
|
|
|
// Workaround'd here by recreating the logger on each iteration of the loop
|
2023-04-11 17:56:31 +08:00
|
|
|
let mut detailed_logger = detailed_logger_dir
|
2023-03-27 23:49:43 +08:00
|
|
|
.as_ref()
|
2023-04-11 17:56:31 +08:00
|
|
|
.map(|logger_dir| (milli::VisualSearchLogger::default(), logger_dir));
|
2023-03-27 23:49:43 +08:00
|
|
|
let logger: &mut dyn SearchLogger<_> =
|
2023-04-11 17:56:31 +08:00
|
|
|
if let Some((detailed_logger, _)) = detailed_logger.as_mut() {
|
2023-03-27 23:49:43 +08:00
|
|
|
detailed_logger
|
|
|
|
} else {
|
|
|
|
&mut default_logger
|
|
|
|
};
|
2023-03-23 16:12:35 +08:00
|
|
|
|
2023-03-22 21:50:41 +08:00
|
|
|
let start = Instant::now();
|
2023-03-27 23:49:43 +08:00
|
|
|
|
2024-05-07 23:56:40 +08:00
|
|
|
let mut ctx = SearchContext::new(&index, &txn)?;
|
2024-04-09 18:03:03 +08:00
|
|
|
let universe = filtered_universe(ctx.index, ctx.txn, &None)?;
|
2023-11-15 22:46:37 +08:00
|
|
|
|
2023-03-22 21:50:41 +08:00
|
|
|
let docs = execute_search(
|
|
|
|
&mut ctx,
|
2023-11-15 22:46:37 +08:00
|
|
|
(!query.trim().is_empty()).then(|| query.trim()),
|
2023-03-22 21:50:41 +08:00
|
|
|
TermsMatchingStrategy::Last,
|
2023-06-07 00:26:33 +08:00
|
|
|
milli::score_details::ScoringStrategy::Skip,
|
2023-04-03 16:09:27 +08:00
|
|
|
false,
|
2023-11-15 22:46:37 +08:00
|
|
|
universe,
|
2023-03-28 22:35:46 +08:00
|
|
|
&None,
|
2024-06-11 23:39:35 +08:00
|
|
|
&None,
|
2023-04-13 19:45:34 +08:00
|
|
|
GeoSortStrategy::default(),
|
2023-03-22 21:50:41 +08:00
|
|
|
0,
|
|
|
|
20,
|
2023-03-27 23:49:43 +08:00
|
|
|
None,
|
2023-03-22 21:50:41 +08:00
|
|
|
&mut DefaultSearchLogger,
|
2023-03-27 23:49:43 +08:00
|
|
|
logger,
|
2024-03-05 18:21:46 +08:00
|
|
|
TimeBudget::max(),
|
2024-04-12 01:04:43 +08:00
|
|
|
None,
|
2024-07-23 20:09:27 +08:00
|
|
|
None,
|
2023-03-22 21:50:41 +08:00
|
|
|
)?;
|
2023-04-11 17:56:31 +08:00
|
|
|
if let Some((logger, dir)) = detailed_logger {
|
|
|
|
logger.finish(&mut ctx, Path::new(dir))?;
|
2023-03-27 23:49:43 +08:00
|
|
|
}
|
2023-03-22 21:50:41 +08:00
|
|
|
let elapsed = start.elapsed();
|
|
|
|
println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
|
2023-03-27 23:49:43 +08:00
|
|
|
if print_documents {
|
|
|
|
let documents = index
|
|
|
|
.documents(&txn, docs.documents_ids.iter().copied())
|
|
|
|
.unwrap()
|
|
|
|
.into_iter()
|
|
|
|
.map(|(id, obkv)| {
|
|
|
|
let mut object = serde_json::Map::default();
|
|
|
|
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
|
|
|
let value = obkv.get(fid).unwrap();
|
|
|
|
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
|
|
|
object.insert(fid_name.to_owned(), value);
|
|
|
|
}
|
|
|
|
(id, serde_json::to_string_pretty(&object).unwrap())
|
|
|
|
})
|
|
|
|
.collect::<Vec<_>>();
|
2023-03-22 21:50:41 +08:00
|
|
|
|
2023-03-27 23:49:43 +08:00
|
|
|
for (id, document) in documents {
|
|
|
|
println!("{id}:");
|
|
|
|
println!("{document}");
|
|
|
|
}
|
2023-03-22 21:50:41 +08:00
|
|
|
|
2023-03-27 23:49:43 +08:00
|
|
|
let documents = index
|
|
|
|
.documents(&txn, docs.documents_ids.iter().copied())
|
|
|
|
.unwrap()
|
|
|
|
.into_iter()
|
|
|
|
.map(|(id, obkv)| {
|
|
|
|
let mut object = serde_json::Map::default();
|
|
|
|
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
|
|
|
let value = obkv.get(fid).unwrap();
|
|
|
|
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
|
|
|
object.insert(fid_name.to_owned(), value);
|
|
|
|
}
|
|
|
|
(id, serde_json::to_string_pretty(&object).unwrap())
|
|
|
|
})
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
|
|
|
for (id, document) in documents {
|
|
|
|
println!("{id}:");
|
|
|
|
println!("{document}");
|
|
|
|
}
|
|
|
|
}
|
2023-03-22 21:50:41 +08:00
|
|
|
}
|
|
|
|
query.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|