Introduce a better query and document lexer

This commit is contained in:
Clément Renault 2020-08-15 20:37:13 +02:00
parent 1e358e3ae8
commit 8806fcd545
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
7 changed files with 117 additions and 13 deletions

34
Cargo.lock generated
View File

@ -6,6 +6,15 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
[[package]]
name = "aho-corasick"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.31" version = "1.0.31"
@ -1002,6 +1011,7 @@ dependencies = [
"askama_warp", "askama_warp",
"astar-iter", "astar-iter",
"bitpacking", "bitpacking",
"bstr",
"byteorder", "byteorder",
"cow-utils", "cow-utils",
"criterion", "criterion",
@ -1028,6 +1038,7 @@ dependencies = [
"structopt", "structopt",
"tempfile", "tempfile",
"tokio", "tokio",
"unicode-linebreak",
"warp", "warp",
] ]
@ -1631,7 +1642,10 @@ version = "1.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
dependencies = [ dependencies = [
"aho-corasick",
"memchr",
"regex-syntax", "regex-syntax",
"thread_local 1.0.1",
] ]
[[package]] [[package]]
@ -1849,7 +1863,7 @@ dependencies = [
"chrono", "chrono",
"log 0.4.8", "log 0.4.8",
"termcolor", "termcolor",
"thread_local", "thread_local 0.3.4",
] ]
[[package]] [[package]]
@ -1964,6 +1978,15 @@ dependencies = [
"unreachable", "unreachable",
] ]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static 1.4.0",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.1.43" version = "0.1.43"
@ -2128,6 +2151,15 @@ dependencies = [
"matches", "matches",
] ]
[[package]]
name = "unicode-linebreak"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150"
dependencies = [
"regex",
]
[[package]] [[package]]
name = "unicode-normalization" name = "unicode-normalization"
version = "0.1.12" version = "0.1.12"

View File

@ -10,6 +10,7 @@ anyhow = "1.0.28"
arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" } arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" }
astar-iter = { git = "https://github.com/Kerollmops/astar-iter" } astar-iter = { git = "https://github.com/Kerollmops/astar-iter" }
bitpacking = "0.8.2" bitpacking = "0.8.2"
bstr = "0.2.13"
byteorder = "1.3.4" byteorder = "1.3.4"
cow-utils = "0.1.2" cow-utils = "0.1.2"
csv = "1.1.3" csv = "1.1.3"
@ -29,6 +30,7 @@ smallstr = "0.2.0"
smallvec = "1.4.0" smallvec = "1.4.0"
structopt = { version = "0.3.14", default-features = false } structopt = { version = "0.3.14", default-features = false }
tempfile = "3.1.0" tempfile = "3.1.0"
unicode-linebreak = "0.1.0"
# logging # logging
log = "0.4.8" log = "0.4.8"

View File

@ -9,6 +9,7 @@ use std::time::Instant;
use anyhow::Context; use anyhow::Context;
use arc_cache::ArcCache; use arc_cache::ArcCache;
use bstr::ByteSlice as _;
use cow_utils::CowUtils; use cow_utils::CowUtils;
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
@ -18,12 +19,11 @@ use memmap::Mmap;
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
use rayon::prelude::*; use rayon::prelude::*;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use slice_group_by::StrGroupBy;
use structopt::StructOpt; use structopt::StructOpt;
use milli::{SmallVec32, Index, DocumentId, Position, Attribute}; use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute};
const LMDB_MAX_KEY_LENGTH: usize = 512; const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_MILLION: usize = 1_000_000; const ONE_MILLION: usize = 1_000_000;
const MAX_POSITION: usize = 1000; const MAX_POSITION: usize = 1000;
@ -39,11 +39,6 @@ const WORD_ATTRIBUTE_DOCIDS_BYTE: u8 = 3;
#[global_allocator] #[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
}
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
#[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")] #[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")]
struct Opt { struct Opt {
@ -345,7 +340,7 @@ where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()>
let mut iter = merger.into_merge_iter()?; let mut iter = merger.into_merge_iter()?;
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (k, v) = result?; let (k, v) = result?;
(f)(&k, &v)?; (f)(&k, &v).with_context(|| format!("writing {:?} {:?} into LMDB", k.as_bstr(), k.as_bstr()))?;
} }
debug!("MTBL stores merged in {:.02?}!", before.elapsed()); debug!("MTBL stores merged in {:.02?}!", before.elapsed());
@ -389,7 +384,7 @@ fn index_csv(
} }
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
let word = word.cow_to_lowercase(); let word = word.cow_to_lowercase();
let position = (attr * MAX_POSITION + pos) as u32; let position = (attr * MAX_POSITION + pos) as u32;
store.insert_word_position(&word, position)?; store.insert_word_position(&word, position)?;

44
src/lexer.rs Normal file
View File

@ -0,0 +1,44 @@
use unicode_linebreak::{linebreaks, BreakClass, break_property};
/// Returns `true` when the Unicode line-break class of `c` is *not* one of
/// the classes listed below, and `false` when it is.
///
/// `extract_token` uses this predicate to decide which trailing characters
/// of a segment are stripped from the token.
fn can_be_broken(c: char) -> bool {
    use BreakClass::*;

    // Classes whose characters are kept as part of a token.
    let kept_in_token = matches!(
        break_property(c as u32),
        Ideographic
            | Alphabetic
            | Numeric
            | CombiningMark
            | WordJoiner
            | NonBreakingGlue
            | OpenPunctuation
            | Symbol
            | EmojiBase
            | EmojiModifier
            | HangulLJamo
            | HangulVJamo
            | HangulTJamo
            | RegionalIndicator
            | Quotation
    );

    !kept_in_token
}
fn extract_token(s: &str) -> &str {
let end = s.char_indices().rev()
.take_while(|(_, c)| can_be_broken(*c))
.last()
.map(|(i, _)| i)
.unwrap_or(s.len());
&s[..end]
}
/// Splits `s` into tokens.
///
/// The string is cut at every offset yielded by `linebreaks`; each resulting
/// segment is passed through `extract_token`, and segments that end up empty
/// are skipped.
pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
    linebreaks(s)
        // The scan state is the byte offset where the current segment starts.
        .scan(0, move |start, (end, _)| {
            let segment = &s[*start..end];
            *start = end;
            Some(extract_token(segment))
        })
        .filter(|token| !token.is_empty())
}

View File

@ -4,6 +4,7 @@ mod node;
mod query_tokens; mod query_tokens;
mod search; mod search;
mod transitive_arc; mod transitive_arc;
pub mod lexer;
use std::collections::HashMap; use std::collections::HashMap;
use std::fs::{File, OpenOptions}; use std::fs::{File, OpenOptions};

View File

@ -1,4 +1,5 @@
use std::{mem, str}; use std::{mem, str};
use unicode_linebreak::{break_property, BreakClass};
use QueryToken::{Quoted, Free}; use QueryToken::{Quoted, Free};
@ -8,6 +9,7 @@ pub enum QueryToken<'a> {
Quoted(&'a str), Quoted(&'a str),
} }
#[derive(Debug)]
enum State { enum State {
Free(usize), Free(usize),
Quoted(usize), Quoted(usize),
@ -67,8 +69,13 @@ impl<'a> Iterator for QueryTokens<'a> {
}, },
State::Fused => return None, State::Fused => return None,
} }
} } else if break_property(c as u32) == BreakClass::Ideographic {
else if !self.state.is_quoted() && !c.is_alphanumeric() { match self.state.replace_by(State::Free(afteri)) {
State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])),
State::Free(s) => return Some(Free(&self.string[s..afteri])),
_ => self.state = State::Free(afteri),
}
} else if !self.state.is_quoted() && !c.is_alphanumeric() {
match self.state.replace_by(State::Free(afteri)) { match self.state.replace_by(State::Free(afteri)) {
State::Free(s) if i > s => return Some(Free(&self.string[s..i])), State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
_ => self.state = State::Free(afteri), _ => self.state = State::Free(afteri),
@ -83,6 +90,15 @@ mod tests {
use super::*; use super::*;
use QueryToken::{Quoted, Free}; use QueryToken::{Quoted, Free};
#[test]
fn empty() {
    // Neither an empty query nor a single-space query yields any token.
    for &query in &["", " "] {
        let mut tokens = QueryTokens::new(query);
        assert_eq!(tokens.next(), None);
    }
}
#[test] #[test]
fn one_quoted_string() { fn one_quoted_string() {
let mut iter = QueryTokens::new("\"hello\""); let mut iter = QueryTokens::new("\"hello\"");
@ -154,4 +170,14 @@ mod tests {
assert_eq!(iter.next(), Some(Quoted("monde est beau"))); assert_eq!(iter.next(), Some(Quoted("monde est beau")));
assert_eq!(iter.next(), None); assert_eq!(iter.next(), None);
} }
#[test]
fn chinese() {
    // Each ideographic character must come out as its own free token: the
    // Ideographic branch of `next` returns the slice ending just after the
    // current character, so an expected token can never be empty.
    let mut iter = QueryTokens::new("汽车男生");
    assert_eq!(iter.next(), Some(Free("汽")));
    assert_eq!(iter.next(), Some(Free("车")));
    assert_eq!(iter.next(), Some(Free("男")));
    assert_eq!(iter.next(), Some(Free("生")));
    assert_eq!(iter.next(), None);
}
} }

View File

@ -217,6 +217,10 @@ impl<'a> Search<'a> {
None => return Ok(Default::default()), None => return Ok(Default::default()),
}; };
if dfas.is_empty() {
return Ok(Default::default());
}
let (derived_words, union_positions) = Self::fetch_words_positions(rtxn, index, &fst, dfas)?; let (derived_words, union_positions) = Self::fetch_words_positions(rtxn, index, &fst, dfas)?;
let candidates = Self::compute_candidates(rtxn, index, &derived_words)?; let candidates = Self::compute_candidates(rtxn, index, &derived_words)?;