From 01c7d2de8fbc9e26a25713b841311942fcc3177f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 22 Mar 2023 14:50:41 +0100
Subject: [PATCH] Add example targets to the milli crate

---
 milli/examples/index.rs    | 134 +++++++++++++++++++++++++++++++++++++++
 milli/examples/search.rs   | 129 +++++++++++++++++++++++++++++++++++++
 milli/examples/settings.rs |  32 +++++++++
 3 files changed, 295 insertions(+)
 create mode 100644 milli/examples/index.rs
 create mode 100644 milli/examples/search.rs
 create mode 100644 milli/examples/settings.rs

diff --git a/milli/examples/index.rs b/milli/examples/index.rs
new file mode 100644
index 000000000..17a62b31f
--- /dev/null
+++ b/milli/examples/index.rs
@@ -0,0 +1,134 @@
+use std::{
+    error::Error,
+    fs::File,
+    io::{BufRead, BufReader, Cursor, Seek},
+    time::Duration,
+};
+
+use heed::EnvOpenOptions;
+use milli::{
+    documents::{DocumentsBatchBuilder, DocumentsBatchReader},
+    update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings},
+    Criterion, Index, Object,
+};
+
+/// Creates the `data_organizations` index: applies its settings (primary key,
+/// searchable fields, ranking criteria), then indexes the CSV dataset. Both steps
+/// share one write transaction, committed at the end.
+fn main() -> Result<(), Box<dyn Error>> {
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+    let index = Index::new(options, "data_organizations").unwrap();
+    let mut wtxn = index.write_txn().unwrap();
+
+    let primary_key = "uuid";
+    // let searchable_fields = vec!["body", "title", "url"];
+    // let searchable_fields = vec!["title", "overview"];
+    let searchable_fields =
+        vec!["name", "primary_role", "city", "region", "country_code", "short_description"];
+    // let filterable_fields = vec!["release_date", "genres"];
+
+    let config = IndexerConfig::default();
+    let mut builder = Settings::new(&mut wtxn, &index, &config);
+    builder.set_primary_key(primary_key.to_owned());
+    let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+    // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
+    // builder.set_filterable_fields(filterable_fields);
+
+    // builder.set_min_word_len_one_typo(5);
+    // builder.set_min_word_len_two_typos(100);
+    builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
+    builder.execute(|_| (), || false).unwrap();
+
+    let config = IndexerConfig::default();
+    let indexing_config = IndexDocumentsConfig::default();
+    let builder =
+        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
+
+    let documents = documents_from(
+        // "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
+        "/Users/meilisearch/Documents/datasets/organizations.csv",
+        // "json"
+        "csv",
+    );
+    let (builder, user_error) = builder.add_documents(documents).unwrap();
+    user_error.unwrap();
+    builder.execute().unwrap();
+    wtxn.commit().unwrap();
+
+    // let rtxn = index.read_txn().unwrap();
+
+    // let mut wtxn = index.write_txn().unwrap();
+    // let config = IndexerConfig::default();
+    // let indexing_config = IndexDocumentsConfig::default();
+    // let builder =
+    //     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
+
+    // let documents = documents_from("test_doc.json", "json");
+    // let (builder, user_error) = builder.add_documents(documents).unwrap();
+    // user_error.unwrap();
+    // builder.execute().unwrap();
+    // wtxn.commit().unwrap();
+
+    // let _ = index.all_documents(&rtxn)?;
+
+    // println!("done!");
+    // std::thread::sleep(Duration::from_secs(100));
+
+    index.prepare_for_closing().wait();
+    Ok(())
+}
+
+/// Reads the dataset at `filename` and converts it into a documents batch reader.
+///
+/// `filetype` selects the parser and must be `"csv"`, `"json"` or `"jsonl"`.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened, if `filetype` is not one of the values
+/// above, or if the content cannot be converted into a documents batch.
+fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
+    let reader = File::open(filename)
+        .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
+    let reader = BufReader::new(reader);
+    let documents = match filetype {
+        "csv" => documents_from_csv(reader).unwrap(),
+        "json" => documents_from_json(reader).unwrap(),
+        "jsonl" => documents_from_jsonl(reader).unwrap(),
+        otherwise => panic!("invalid update format {:?}", otherwise),
+    };
+    DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
+}
+
+/// Builds a documents batch from a stream of JSON `Object`s; panics on invalid JSON.
+fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());
+
+    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
+        let object = result.unwrap();
+        documents.append_json_object(&object)?;
+    }
+
+    documents.into_inner().map_err(Into::into)
+}
+
+/// Builds a documents batch by feeding `reader` to `append_json_array`.
+fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());
+
+    documents.append_json_array(reader)?;
+
+    documents.into_inner().map_err(Into::into)
+}
+
+/// Builds a documents batch from `reader` parsed as CSV.
+fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
+    let csv = csv::Reader::from_reader(reader);
+
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());
+    documents.append_csv(csv)?;
+
+    documents.into_inner().map_err(Into::into)
+}
diff --git a/milli/examples/search.rs b/milli/examples/search.rs
new file mode 100644
index 000000000..558f92bac
--- /dev/null
+++ b/milli/examples/search.rs
@@ -0,0 +1,129 @@
+// use crate::allocator::ALLOC;
+use std::error::Error;
+use std::io::stdin;
+use std::time::Instant;
+
+use heed::EnvOpenOptions;
+use milli::{
+    execute_search, DefaultSearchLogger, Index, Search, SearchContext, TermsMatchingStrategy,
+};
+
+#[global_allocator]
+static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+/// Interactive benchmark: reads one query per line from stdin and, for each query,
+/// runs `execute_search` ("new") and `Search::execute` ("old") ten times, printing
+/// the elapsed time and the returned document ids of every run.
+///
+/// The index to open is given by the first command-line argument.
+fn main() -> Result<(), Box<dyn Error>> {
+    // TODO: command line
+    let mut args = std::env::args();
+    let _ = args.next().unwrap();
+    let dataset = args.next().ok_or("usage: search <path-to-index>")?;
+
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+    // Query:
+    // disp: 20
+    //
+    // dasp: 70 words
+    // dosp: 80
+    // dasc: 80
+    //
+    //
+    // daspouyerf
+    // daspojewkfb
+
+    let index = Index::new(options, dataset)?;
+    let txn = index.read_txn()?;
+    let mut query = String::new();
+    while stdin().read_line(&mut query)? > 0 {
+        for _ in 0..10 {
+            let start = Instant::now();
+            // let mut logger = milli::DetailedSearchLogger::new("log");
+            let mut ctx = SearchContext::new(&index, &txn);
+            let docs = execute_search(
+                &mut ctx,
+                query.trim(),
+                // what a the from which when there is
+                TermsMatchingStrategy::Last,
+                None,
+                0,
+                20,
+                &mut DefaultSearchLogger,
+                &mut DefaultSearchLogger,
+                // &mut logger,
+            )?;
+            // logger.write_d2_description(&mut ctx);
+            let elapsed = start.elapsed();
+            println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
+
+            // let documents = index
+            //     .documents(&txn, docs.documents_ids.iter().copied())
+            //     .unwrap()
+            //     .into_iter()
+            //     .map(|(id, obkv)| {
+            //         let mut object = serde_json::Map::default();
+            //         for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
+            //             let value = obkv.get(fid).unwrap();
+            //             let value: serde_json::Value = serde_json::from_slice(value).unwrap();
+            //             object.insert(fid_name.to_owned(), value);
+            //         }
+            //         (id, serde_json::to_string_pretty(&object).unwrap())
+            //     })
+            //     .collect::<Vec<_>>();
+
+            // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
+            // for (id, document) in documents {
+            //     println!("{id}:");
+            //     println!("{document}");
+            // }
+
+            let start = Instant::now();
+            let mut s = Search::new(&txn, &index);
+            s.query(
+                // "which a the releases from poison by the government",
+                // "sun flower s are the best",
+                query.trim(),
+            );
+            s.terms_matching_strategy(TermsMatchingStrategy::Last);
+            // s.limit(1);
+            // s.criterion_implementation_strategy(
+            //     milli::CriterionImplementationStrategy::OnlySetBased,
+            // );
+
+            let docs = s.execute()?;
+            let elapsed = start.elapsed();
+            println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
+
+            // let documents = index
+            //     .documents(&txn, docs.documents_ids.iter().copied())
+            //     .unwrap()
+            //     .into_iter()
+            //     .map(|(id, obkv)| {
+            //         let mut object = serde_json::Map::default();
+            //         for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
+            //             let value = obkv.get(fid).unwrap();
+            //             let value: serde_json::Value = serde_json::from_slice(value).unwrap();
+            //             object.insert(fid_name.to_owned(), value);
+            //         }
+            //         (id, serde_json::to_string_pretty(&object).unwrap())
+            //     })
+            //     .collect::<Vec<_>>();
+            // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
+            // for (id, document) in documents {
+            //     println!("{id}:");
+            //     println!("{document}");
+            // }
+        }
+        query.clear();
+    }
+    // for (id, document) in documents {
+    //     println!("{id}:");
+    //     // println!("{document}");
+    // }
+
+    Ok(())
+}
diff --git a/milli/examples/settings.rs b/milli/examples/settings.rs
new file mode 100644
index 000000000..fb9cf2789
--- /dev/null
+++ b/milli/examples/settings.rs
@@ -0,0 +1,32 @@
+// use big_s::S;
+use heed::EnvOpenOptions;
+// use maplit::hashset;
+use milli::{
+    update::{IndexerConfig, Settings},
+    Criterion, Index,
+};
+
+/// Sets the ranking criteria of the `data_wiki` index to words, typo and proximity.
+fn main() {
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+    let index = Index::new(options, "data_wiki").unwrap();
+    let mut wtxn = index.write_txn().unwrap();
+
+    let config = IndexerConfig::default();
+    let mut builder = Settings::new(&mut wtxn, &index, &config);
+
+    // builder.set_min_word_len_one_typo(5);
+    // builder.set_min_word_len_two_typos(7);
+    // builder.set_sortable_fields(hashset! { S("release_date") });
+    builder.set_criteria(vec![
+        Criterion::Words,
+        Criterion::Typo,
+        Criterion::Proximity,
+        // Criterion::Asc("release_date".to_owned()),
+    ]);
+
+    builder.execute(|_| (), || false).unwrap();
+    wtxn.commit().unwrap();
+}