diff --git a/Cargo.lock b/Cargo.lock index c3fb8f29f..fc39b8f47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,6 +6,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" +[[package]] +name = "aho-corasick" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.31" @@ -1002,6 +1011,7 @@ dependencies = [ "askama_warp", "astar-iter", "bitpacking", + "bstr", "byteorder", "cow-utils", "criterion", @@ -1028,6 +1038,7 @@ dependencies = [ "structopt", "tempfile", "tokio", + "unicode-linebreak", "warp", ] @@ -1631,7 +1642,10 @@ version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" dependencies = [ + "aho-corasick", + "memchr", "regex-syntax", + "thread_local 1.0.1", ] [[package]] @@ -1849,7 +1863,7 @@ dependencies = [ "chrono", "log 0.4.8", "termcolor", - "thread_local", + "thread_local 0.3.4", ] [[package]] @@ -1964,6 +1978,15 @@ dependencies = [ "unreachable", ] +[[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static 1.4.0", +] + [[package]] name = "time" version = "0.1.43" @@ -2128,6 +2151,15 @@ dependencies = [ "matches", ] +[[package]] +name = "unicode-linebreak" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150" +dependencies = [ + "regex", +] + [[package]] name = "unicode-normalization" version = "0.1.12" diff --git a/Cargo.toml b/Cargo.toml index c34dd6511..5880fb4d4 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ anyhow = "1.0.28" arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" } astar-iter = { git = "https://github.com/Kerollmops/astar-iter" } bitpacking = "0.8.2" +bstr = "0.2.13" byteorder = "1.3.4" cow-utils = "0.1.2" csv = "1.1.3" @@ -29,6 +30,7 @@ smallstr = "0.2.0" smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" +unicode-linebreak = "0.1.0" # logging log = "0.4.8" diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 8bfdeb80e..034fae115 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -9,6 +9,7 @@ use std::time::Instant; use anyhow::Context; use arc_cache::ArcCache; +use bstr::ByteSlice as _; use cow_utils::CowUtils; use fst::IntoStreamer; use heed::EnvOpenOptions; @@ -18,12 +19,11 @@ use memmap::Mmap; use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; use rayon::prelude::*; use roaring::RoaringBitmap; -use slice_group_by::StrGroupBy; use structopt::StructOpt; -use milli::{SmallVec32, Index, DocumentId, Position, Attribute}; +use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute}; -const LMDB_MAX_KEY_LENGTH: usize = 512; +const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_MILLION: usize = 1_000_000; const MAX_POSITION: usize = 1000; @@ -39,11 +39,6 @@ const WORD_ATTRIBUTE_DOCIDS_BYTE: u8 = 3; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; -pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator { - let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); - string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) -} - #[derive(Debug, StructOpt)] #[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")] struct Opt { @@ -345,7 +340,7 @@ where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()> let mut iter = merger.into_merge_iter()?; while let Some(result) = 
iter.next() { let (k, v) = result?; - (f)(&k, &v)?; + (f)(&k, &v).with_context(|| format!("writing {:?} {:?} into LMDB", k.as_bstr(), v.as_bstr()))?; } debug!("MTBL stores merged in {:.02?}!", before.elapsed()); @@ -389,7 +384,7 @@ fn index_csv( } for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { - for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { + for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) { let word = word.cow_to_lowercase(); let position = (attr * MAX_POSITION + pos) as u32; store.insert_word_position(&word, position)?; diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 000000000..c0910b231 --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,44 @@ +use unicode_linebreak::{linebreaks, BreakClass, break_property}; + +fn can_be_broken(c: char) -> bool { + use BreakClass::*; + + match break_property(c as u32) { + Ideographic + | Alphabetic + | Numeric + | CombiningMark + | WordJoiner + | NonBreakingGlue + | OpenPunctuation + | Symbol + | EmojiBase + | EmojiModifier + | HangulLJamo + | HangulVJamo + | HangulTJamo + | RegionalIndicator + | Quotation => false, + _ => true, + } +} + +fn extract_token(s: &str) -> &str { + let end = s.char_indices().rev() + .take_while(|(_, c)| can_be_broken(*c)) + .last() + .map(|(i, _)| i) + .unwrap_or(s.len()); + + &s[..end] +} + +pub fn break_string(s: &str) -> impl Iterator<Item = &str> + '_ { + let mut prev = 0; + linebreaks(&s).map(move |(i, _)| { + let s = &s[prev..i]; + prev = i; + extract_token(s) + }) + .filter(|s| !s.is_empty()) +} diff --git a/src/lib.rs b/src/lib.rs index 533d342ec..1e21331dc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod node; mod query_tokens; mod search; mod transitive_arc; +pub mod lexer; use std::collections::HashMap; use std::fs::{File, OpenOptions}; diff --git a/src/query_tokens.rs b/src/query_tokens.rs index 521942f7b..5ea606ac6 100644 --- a/src/query_tokens.rs +++ b/src/query_tokens.rs @@ -1,4
+1,5 @@ use std::{mem, str}; +use unicode_linebreak::{break_property, BreakClass}; use QueryToken::{Quoted, Free}; @@ -8,6 +9,7 @@ pub enum QueryToken<'a> { Quoted(&'a str), } +#[derive(Debug)] enum State { Free(usize), Quoted(usize), @@ -67,8 +69,13 @@ impl<'a> Iterator for QueryTokens<'a> { }, State::Fused => return None, } - } - else if !self.state.is_quoted() && !c.is_alphanumeric() { + } else if break_property(c as u32) == BreakClass::Ideographic { + match self.state.replace_by(State::Free(afteri)) { + State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])), + State::Free(s) => return Some(Free(&self.string[s..afteri])), + _ => self.state = State::Free(afteri), + } + } else if !self.state.is_quoted() && !c.is_alphanumeric() { match self.state.replace_by(State::Free(afteri)) { State::Free(s) if i > s => return Some(Free(&self.string[s..i])), _ => self.state = State::Free(afteri), @@ -83,6 +90,15 @@ mod tests { use super::*; use QueryToken::{Quoted, Free}; + #[test] + fn empty() { + let mut iter = QueryTokens::new(""); + assert_eq!(iter.next(), None); + + let mut iter = QueryTokens::new(" "); + assert_eq!(iter.next(), None); + } + #[test] fn one_quoted_string() { let mut iter = QueryTokens::new("\"hello\""); @@ -154,4 +170,14 @@ mod tests { assert_eq!(iter.next(), Some(Quoted("monde est beau"))); assert_eq!(iter.next(), None); } + + #[test] + fn chinese() { + let mut iter = QueryTokens::new("汽车男生"); + assert_eq!(iter.next(), Some(Free("汽"))); + assert_eq!(iter.next(), Some(Free("车"))); + assert_eq!(iter.next(), Some(Free("男"))); + assert_eq!(iter.next(), Some(Free("生"))); + assert_eq!(iter.next(), None); + } } diff --git a/src/search.rs b/src/search.rs index 34772dbcb..b55b38904 100644 --- a/src/search.rs +++ b/src/search.rs @@ -217,6 +217,10 @@ impl<'a> Search<'a> { None => return Ok(Default::default()), }; + if dfas.is_empty() { + return Ok(Default::default()); + } + let (derived_words, union_positions) = Self::fetch_words_positions(rtxn, index, &fst, 
dfas)?; let candidates = Self::compute_candidates(rtxn, index, &derived_words)?;