From 0cd9e62fc6da00242f03af7aaf71f99cc379eeaa Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 24 Dec 2020 12:58:34 +0100 Subject: [PATCH] search first iteration --- Cargo.lock | 139 +++++++++++++++++++++++++++++++++++- Cargo.toml | 6 +- src/data.rs | 157 ++++++++++++++++++++++++++++++++++++++++- src/routes/document.rs | 3 +- src/routes/search.rs | 68 ++++++++---------- 5 files changed, 329 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 451b8efc0..c5cea3936 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -299,6 +299,12 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" + [[package]] name = "aho-corasick" version = "0.7.15" @@ -574,6 +580,15 @@ dependencies = [ "jobserver", ] +[[package]] +name = "cedarwood" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -586,6 +601,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "character_converter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +dependencies = [ + "bincode", +] + [[package]] name = "chrono" version = "0.4.19" @@ -645,6 +669,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2df960f5d869b2dd8532793fde43eb5427cceb126c929747a26823ab0eeb536" +[[package]] +name = "cow-utils" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" + [[package]] name = "cpuid-bool" version = "0.1.2" @@ -768,6 +798,12 @@ dependencies = [ "syn", ] +[[package]] +name = "deunicode" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1" + [[package]] name = "digest" version = "0.8.1" @@ -1127,6 +1163,16 @@ dependencies = [ "tracing-futures", ] +[[package]] +name = "hashbrown" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" +dependencies = [ + "ahash", + "autocfg", +] + [[package]] name = "hashbrown" version = "0.9.1" @@ -1326,7 +1372,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.9.1", "serde", ] @@ -1402,6 +1448,21 @@ dependencies = [ "libc", ] +[[package]] +name = "jieba-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a" +dependencies = [ + "cedarwood", + "fxhash", + "hashbrown 0.9.1", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + [[package]] name = "jobserver" version = "0.1.21" @@ -1550,6 +1611,7 @@ dependencies = [ "crossbeam-channel", "env_logger 0.8.2", "flate2", + "fst", "futures", "futures-util", "grenad", @@ -1560,6 +1622,7 @@ dependencies = [ "log", "main_error", "meilisearch-error", + "meilisearch-tokenizer", "memmap", "milli", "mime", @@ -1587,6 +1650,22 @@ dependencies = [ "whoami", ] +[[package]] +name = "meilisearch-tokenizer" +version = "0.1.1" +source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + [[package]] name = "memchr" version = "2.3.4" @@ -1634,6 +1713,7 @@ dependencies = [ "levenshtein_automata", "linked-hash-map", "log", + "meilisearch-tokenizer", "memmap", "near-proximity", "num-traits", @@ -1938,6 +2018,44 @@ dependencies = [ "sha-1 0.8.2", ] +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand 0.7.3", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "0.4.27" @@ -2101,6 +2219,7 @@ dependencies = [ "rand_chacha", "rand_core 0.5.1", "rand_hc", + "rand_pcg", ] [[package]] @@ -2146,6 +2265,15 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xoshiro" version = "0.4.0" @@ -3342,6 +3470,15 @@ dependencies = [ "webpki", ] +[[package]] +name = "whatlang" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075" +dependencies = [ + "hashbrown 0.7.2", +] + [[package]] name = "whoami" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index 4395b2694..fb8531a96 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,9 @@ chrono = { version = "0.4.19", features = ["serde"] } crossbeam-channel = "0.5.0" env_logger = "0.8.2" flate2 = "1.0.19" +fst = "0.4.5" futures = "0.3.7" +futures-util = "0.3.8" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.6" http = "0.2.1" @@ -34,6 +36,8 @@ indexmap = { version = "1.3.2", features = ["serde-1"] } log = "0.4.8" main_error = "0.1.0" meilisearch-error = { path = "../MeiliSearch/meilisearch-error" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } +memmap = "0.7.0" milli = { path = "../milli" } mime = "0.3.16" once_cell = "1.5.2" @@ -54,8 +58,6 @@ tokio = { version = "0.2", features = ["full"] } ureq = { version = "1.5.1", default-features = false, features = ["tls"] } walkdir = "2.3.1" whoami = "1.0.0" -futures-util = "0.3.8" -memmap = "0.7.0" [dependencies.sentry] default-features = false diff --git a/src/data.rs b/src/data.rs index 0702d7364..48826a44f 100644 --- a/src/data.rs +++ b/src/data.rs @@ -1,17 +1,53 @@ +use std::borrow::Cow; +use std::collections::HashSet; +use std::fs::create_dir_all; +use std::mem; use std::ops::Deref; use std::sync::Arc; -use std::fs::create_dir_all; +use std::time::Instant; use async_compression::tokio_02::write::GzipEncoder; use futures_util::stream::StreamExt; use tokio::io::AsyncWriteExt; -use milli::Index; +use milli::{Index, SearchResult as Results, obkv_to_json}; use milli::update::{IndexDocumentsMethod, UpdateFormat}; use sha2::Digest; +use serde_json::{Value, Map}; +use serde::{Deserialize, Serialize}; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use crate::option::Opt; use crate::updates::{UpdateQueue, UpdateMeta, UpdateStatus, UpdateMetaProgress}; +const DEFAULT_SEARCH_LIMIT: usize = 20; + +#[derive(Deserialize)] +#[serde(rename_all = "camelCase", deny_unknown_fields)] +pub struct SearchQuery { + q: Option, + offset: Option, + limit: Option, + attributes_to_retrieve: Option>, + attributes_to_crop: Option>, + crop_length: Option, + attributes_to_highlight: Option>, + filters: Option, + matches: Option, + facet_filters: Option, + facets_distribution: Option>, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +pub struct SearchResult { + hits: Vec>, + nb_hits: usize, + query: String, + limit: usize, + offset: usize, + processing_time_ms: u128, +} + #[derive(Clone)] pub struct Data { inner: Arc, @@ -81,8 +117,9 @@ impl Data { Ok(Data { inner }) } - pub async fn add_documents( + pub async fn add_documents( &self, + _index: S, method: IndexDocumentsMethod, format: UpdateFormat, mut stream: impl futures::Stream> + Unpin, @@ -90,6 +127,7 @@ impl Data { where B: Deref, E: std::error::Error + Send + Sync + 'static, + S: AsRef, { let file = tokio::task::spawn_blocking(tempfile::tempfile).await?; let file = tokio::fs::File::from_std(file?); @@ -115,6 +153,60 @@ impl Data { Ok(UpdateStatus::Pending { update_id, meta }) } + pub fn search>(&self, _index: S, search_query: SearchQuery) -> anyhow::Result { + let start = Instant::now(); + let index = &self.indexes; + let rtxn = index.read_txn()?; + + let mut search = index.search(&rtxn); + if let Some(query) = &search_query.q { + search.query(query); + } + + if let Some(offset) = search_query.offset { + search.offset(offset); + } + + let limit = search_query.limit.unwrap_or(DEFAULT_SEARCH_LIMIT); + search.limit(limit); + + let Results { found_words, documents_ids, nb_hits, .. } = search.execute().unwrap(); + + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let displayed_fields = match index.displayed_fields(&rtxn).unwrap() { + Some(fields) => Cow::Borrowed(fields), + None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), + }; + + let attributes_to_highlight = match search_query.attributes_to_highlight { + Some(fields) => fields.iter().map(ToOwned::to_owned).collect(), + None => HashSet::new(), + }; + + let stop_words = fst::Set::default(); + let highlighter = Highlighter::new(&stop_words); + let mut documents = Vec::new(); + for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { + let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); + highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); + documents.push(object); + } + + let processing_time_ms = start.elapsed().as_millis(); + + let result = SearchResult { + hits: documents, + nb_hits, + query: search_query.q.unwrap_or_default(), + offset: search_query.offset.unwrap_or(0), + limit, + processing_time_ms, + }; + + Ok(result) + } + #[inline] pub fn http_payload_size_limit(&self) -> usize { self.options.http_payload_size_limit.get_bytes() as usize @@ -125,3 +217,62 @@ impl Data { &self.api_keys } } + +struct Highlighter<'a, A> { + analyzer: Analyzer<'a, A>, +} + +impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { + fn new(stop_words: &'a fst::Set) -> Self { + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + Self { analyzer } + } + + fn highlight_value(&self, value: Value, words_to_highlight: &HashSet) -> Value { + match value { + Value::Null => Value::Null, + Value::Bool(boolean) => Value::Bool(boolean), + Value::Number(number) => Value::Number(number), + Value::String(old_string) => { + let mut string = String::new(); + let analyzed = self.analyzer.analyze(&old_string); + for (word, token) in analyzed.reconstruct() { + if token.is_word() { + let to_highlight = words_to_highlight.contains(token.text()); + if to_highlight { string.push_str("") } + string.push_str(word); + if to_highlight { string.push_str("") } + } else { + string.push_str(word); + } + } + Value::String(string) + }, + Value::Array(values) => { + Value::Array(values.into_iter() + .map(|v| self.highlight_value(v, words_to_highlight)) + .collect()) + }, + Value::Object(object) => { + Value::Object(object.into_iter() + .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) + .collect()) + }, + } + } + + fn highlight_record( + &self, + object: &mut Map, + words_to_highlight: &HashSet, + attributes_to_highlight: &HashSet, + ) { + // TODO do we need to create a string for element that are not and needs to be highlight? + for (key, value) in object.iter_mut() { + if attributes_to_highlight.contains(key) { + let old_value = mem::take(value); + *value = self.highlight_value(old_value, words_to_highlight); + } + } + } +} diff --git a/src/routes/document.rs b/src/routes/document.rs index c24c01af8..6c5f93991 100644 --- a/src/routes/document.rs +++ b/src/routes/document.rs @@ -117,11 +117,12 @@ async fn update_multiple_documents( async fn add_documents_json( data: web::Data, path: web::Path, - params: web::Query, + _params: web::Query, body: Payload, ) -> Result { let addition_result = data .add_documents( + &path.index_uid, IndexDocumentsMethod::UpdateDocuments, UpdateFormat::Json, body diff --git a/src/routes/search.rs b/src/routes/search.rs index 2c510e0d4..967065687 100644 --- a/src/routes/search.rs +++ b/src/routes/search.rs @@ -1,31 +1,31 @@ use actix_web::{get, post, web, HttpResponse}; -use serde::{Deserialize, Serialize}; -use serde_json::Value; +use log::error; use crate::error::ResponseError; use crate::helpers::Authentication; use crate::routes::IndexParam; use crate::Data; +use crate::data::SearchQuery; pub fn services(cfg: &mut web::ServiceConfig) { cfg.service(search_with_post).service(search_with_url_query); } -#[derive(Serialize, Deserialize)] -#[serde(rename_all = "camelCase", deny_unknown_fields)] -pub struct SearchQuery { - q: Option, - offset: Option, - limit: Option, - attributes_to_retrieve: Option, - attributes_to_crop: Option, - crop_length: Option, - attributes_to_highlight: Option, - filters: Option, - matches: Option, - facet_filters: Option, - facets_distribution: Option, -} +//#[derive(Serialize, Deserialize)] +//#[serde(rename_all = "camelCase", deny_unknown_fields)] +//pub struct SearchQuery { + //q: Option, + //offset: Option, + //limit: Option, + //attributes_to_retrieve: Option, + //attributes_to_crop: Option, + //crop_length: Option, + //attributes_to_highlight: Option, + //filters: Option, + //matches: Option, + //facet_filters: Option, + //facets_distribution: Option, +//} #[get("/indexes/{index_uid}/search", wrap = "Authentication::Public")] async fn search_with_url_query( @@ -36,27 +36,21 @@ async fn search_with_url_query( todo!() } -#[derive(Deserialize)] -#[serde(rename_all = "camelCase", deny_unknown_fields)] -pub struct SearchQueryPost { - _q: Option, - _offset: Option, - _limit: Option, - _attributes_to_retrieve: Option>, - _attributes_to_crop: Option>, - _crop_length: Option, - _attributes_to_highlight: Option>, - _filters: Option, - _matches: Option, - _facet_filters: Option, - _facets_distribution: Option>, -} - #[post("/indexes/{index_uid}/search", wrap = "Authentication::Public")] async fn search_with_post( - _data: web::Data, - _path: web::Path, - _params: web::Json, + data: web::Data, + path: web::Path, + params: web::Json, ) -> Result { - todo!() + let search_result = data.search(&path.index_uid, params.into_inner()); + match search_result { + Ok(docs) => { + let docs = serde_json::to_string(&docs).unwrap(); + Ok(HttpResponse::Ok().body(docs)) + } + Err(e) => { + error!("{}", e); + todo!() + } + } }