mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 03:55:07 +08:00
Introduce a better query and document lexer
This commit is contained in:
parent
1e358e3ae8
commit
8806fcd545
34
Cargo.lock
generated
34
Cargo.lock
generated
@ -6,6 +6,15 @@ version = "0.2.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
|
checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "0.7.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.31"
|
version = "1.0.31"
|
||||||
@ -1002,6 +1011,7 @@ dependencies = [
|
|||||||
"askama_warp",
|
"askama_warp",
|
||||||
"astar-iter",
|
"astar-iter",
|
||||||
"bitpacking",
|
"bitpacking",
|
||||||
|
"bstr",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"cow-utils",
|
"cow-utils",
|
||||||
"criterion",
|
"criterion",
|
||||||
@ -1028,6 +1038,7 @@ dependencies = [
|
|||||||
"structopt",
|
"structopt",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"unicode-linebreak",
|
||||||
"warp",
|
"warp",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -1631,7 +1642,10 @@ version = "1.3.9"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
|
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
"regex-syntax",
|
"regex-syntax",
|
||||||
|
"thread_local 1.0.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1849,7 +1863,7 @@ dependencies = [
|
|||||||
"chrono",
|
"chrono",
|
||||||
"log 0.4.8",
|
"log 0.4.8",
|
||||||
"termcolor",
|
"termcolor",
|
||||||
"thread_local",
|
"thread_local 0.3.4",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1964,6 +1978,15 @@ dependencies = [
|
|||||||
"unreachable",
|
"unreachable",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thread_local"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
|
||||||
|
dependencies = [
|
||||||
|
"lazy_static 1.4.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "time"
|
name = "time"
|
||||||
version = "0.1.43"
|
version = "0.1.43"
|
||||||
@ -2128,6 +2151,15 @@ dependencies = [
|
|||||||
"matches",
|
"matches",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-linebreak"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150"
|
||||||
|
dependencies = [
|
||||||
|
"regex",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-normalization"
|
name = "unicode-normalization"
|
||||||
version = "0.1.12"
|
version = "0.1.12"
|
||||||
|
@ -10,6 +10,7 @@ anyhow = "1.0.28"
|
|||||||
arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" }
|
arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" }
|
||||||
astar-iter = { git = "https://github.com/Kerollmops/astar-iter" }
|
astar-iter = { git = "https://github.com/Kerollmops/astar-iter" }
|
||||||
bitpacking = "0.8.2"
|
bitpacking = "0.8.2"
|
||||||
|
bstr = "0.2.13"
|
||||||
byteorder = "1.3.4"
|
byteorder = "1.3.4"
|
||||||
cow-utils = "0.1.2"
|
cow-utils = "0.1.2"
|
||||||
csv = "1.1.3"
|
csv = "1.1.3"
|
||||||
@ -29,6 +30,7 @@ smallstr = "0.2.0"
|
|||||||
smallvec = "1.4.0"
|
smallvec = "1.4.0"
|
||||||
structopt = { version = "0.3.14", default-features = false }
|
structopt = { version = "0.3.14", default-features = false }
|
||||||
tempfile = "3.1.0"
|
tempfile = "3.1.0"
|
||||||
|
unicode-linebreak = "0.1.0"
|
||||||
|
|
||||||
# logging
|
# logging
|
||||||
log = "0.4.8"
|
log = "0.4.8"
|
||||||
|
@ -9,6 +9,7 @@ use std::time::Instant;
|
|||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use arc_cache::ArcCache;
|
use arc_cache::ArcCache;
|
||||||
|
use bstr::ByteSlice as _;
|
||||||
use cow_utils::CowUtils;
|
use cow_utils::CowUtils;
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
@ -18,12 +19,11 @@ use memmap::Mmap;
|
|||||||
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
|
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use slice_group_by::StrGroupBy;
|
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
use milli::{SmallVec32, Index, DocumentId, Position, Attribute};
|
use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute};
|
||||||
|
|
||||||
const LMDB_MAX_KEY_LENGTH: usize = 512;
|
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||||
const ONE_MILLION: usize = 1_000_000;
|
const ONE_MILLION: usize = 1_000_000;
|
||||||
|
|
||||||
const MAX_POSITION: usize = 1000;
|
const MAX_POSITION: usize = 1000;
|
||||||
@ -39,11 +39,6 @@ const WORD_ATTRIBUTE_DOCIDS_BYTE: u8 = 3;
|
|||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||||
|
|
||||||
pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
|
|
||||||
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
|
|
||||||
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, StructOpt)]
|
#[derive(Debug, StructOpt)]
|
||||||
#[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")]
|
#[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")]
|
||||||
struct Opt {
|
struct Opt {
|
||||||
@ -345,7 +340,7 @@ where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()>
|
|||||||
let mut iter = merger.into_merge_iter()?;
|
let mut iter = merger.into_merge_iter()?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let (k, v) = result?;
|
let (k, v) = result?;
|
||||||
(f)(&k, &v)?;
|
(f)(&k, &v).with_context(|| format!("writing {:?} {:?} into LMDB", k.as_bstr(), k.as_bstr()))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
||||||
@ -389,7 +384,7 @@ fn index_csv(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
||||||
for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
|
for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
|
||||||
let word = word.cow_to_lowercase();
|
let word = word.cow_to_lowercase();
|
||||||
let position = (attr * MAX_POSITION + pos) as u32;
|
let position = (attr * MAX_POSITION + pos) as u32;
|
||||||
store.insert_word_position(&word, position)?;
|
store.insert_word_position(&word, position)?;
|
||||||
|
44
src/lexer.rs
Normal file
44
src/lexer.rs
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
use unicode_linebreak::{linebreaks, BreakClass, break_property};
|
||||||
|
|
||||||
|
fn can_be_broken(c: char) -> bool {
|
||||||
|
use BreakClass::*;
|
||||||
|
|
||||||
|
match break_property(c as u32) {
|
||||||
|
Ideographic
|
||||||
|
| Alphabetic
|
||||||
|
| Numeric
|
||||||
|
| CombiningMark
|
||||||
|
| WordJoiner
|
||||||
|
| NonBreakingGlue
|
||||||
|
| OpenPunctuation
|
||||||
|
| Symbol
|
||||||
|
| EmojiBase
|
||||||
|
| EmojiModifier
|
||||||
|
| HangulLJamo
|
||||||
|
| HangulVJamo
|
||||||
|
| HangulTJamo
|
||||||
|
| RegionalIndicator
|
||||||
|
| Quotation => false,
|
||||||
|
_ => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_token(s: &str) -> &str {
|
||||||
|
let end = s.char_indices().rev()
|
||||||
|
.take_while(|(_, c)| can_be_broken(*c))
|
||||||
|
.last()
|
||||||
|
.map(|(i, _)| i)
|
||||||
|
.unwrap_or(s.len());
|
||||||
|
|
||||||
|
&s[..end]
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
|
||||||
|
let mut prev = 0;
|
||||||
|
linebreaks(&s).map(move |(i, _)| {
|
||||||
|
let s = &s[prev..i];
|
||||||
|
prev = i;
|
||||||
|
extract_token(s)
|
||||||
|
})
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
}
|
@ -4,6 +4,7 @@ mod node;
|
|||||||
mod query_tokens;
|
mod query_tokens;
|
||||||
mod search;
|
mod search;
|
||||||
mod transitive_arc;
|
mod transitive_arc;
|
||||||
|
pub mod lexer;
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::{File, OpenOptions};
|
use std::fs::{File, OpenOptions};
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::{mem, str};
|
use std::{mem, str};
|
||||||
|
use unicode_linebreak::{break_property, BreakClass};
|
||||||
|
|
||||||
use QueryToken::{Quoted, Free};
|
use QueryToken::{Quoted, Free};
|
||||||
|
|
||||||
@ -8,6 +9,7 @@ pub enum QueryToken<'a> {
|
|||||||
Quoted(&'a str),
|
Quoted(&'a str),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
enum State {
|
enum State {
|
||||||
Free(usize),
|
Free(usize),
|
||||||
Quoted(usize),
|
Quoted(usize),
|
||||||
@ -67,8 +69,13 @@ impl<'a> Iterator for QueryTokens<'a> {
|
|||||||
},
|
},
|
||||||
State::Fused => return None,
|
State::Fused => return None,
|
||||||
}
|
}
|
||||||
|
} else if break_property(c as u32) == BreakClass::Ideographic {
|
||||||
|
match self.state.replace_by(State::Free(afteri)) {
|
||||||
|
State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])),
|
||||||
|
State::Free(s) => return Some(Free(&self.string[s..afteri])),
|
||||||
|
_ => self.state = State::Free(afteri),
|
||||||
}
|
}
|
||||||
else if !self.state.is_quoted() && !c.is_alphanumeric() {
|
} else if !self.state.is_quoted() && !c.is_alphanumeric() {
|
||||||
match self.state.replace_by(State::Free(afteri)) {
|
match self.state.replace_by(State::Free(afteri)) {
|
||||||
State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
|
State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
|
||||||
_ => self.state = State::Free(afteri),
|
_ => self.state = State::Free(afteri),
|
||||||
@ -83,6 +90,15 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
use QueryToken::{Quoted, Free};
|
use QueryToken::{Quoted, Free};
|
||||||
|
|
||||||
|
#[test]
fn empty() {
    // Neither an empty query nor a single-space query yields a token.
    for query in ["", " "].iter().copied() {
        let mut tokens = QueryTokens::new(query);
        assert_eq!(tokens.next(), None);
    }
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn one_quoted_string() {
|
fn one_quoted_string() {
|
||||||
let mut iter = QueryTokens::new("\"hello\"");
|
let mut iter = QueryTokens::new("\"hello\"");
|
||||||
@ -154,4 +170,14 @@ mod tests {
|
|||||||
assert_eq!(iter.next(), Some(Quoted("monde est beau")));
|
assert_eq!(iter.next(), Some(Quoted("monde est beau")));
|
||||||
assert_eq!(iter.next(), None);
|
assert_eq!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn chinese() {
    // Each ideograph is expected to come out as its own free token, in order.
    let mut tokens = QueryTokens::new("汽车男生");

    for expected in ["汽", "车", "男", "生"].iter().copied() {
        assert_eq!(tokens.next(), Some(Free(expected)));
    }

    assert_eq!(tokens.next(), None);
}
|
||||||
}
|
}
|
||||||
|
@ -217,6 +217,10 @@ impl<'a> Search<'a> {
|
|||||||
None => return Ok(Default::default()),
|
None => return Ok(Default::default()),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if dfas.is_empty() {
|
||||||
|
return Ok(Default::default());
|
||||||
|
}
|
||||||
|
|
||||||
let (derived_words, union_positions) = Self::fetch_words_positions(rtxn, index, &fst, dfas)?;
|
let (derived_words, union_positions) = Self::fetch_words_positions(rtxn, index, &fst, dfas)?;
|
||||||
let candidates = Self::compute_candidates(rtxn, index, &derived_words)?;
|
let candidates = Self::compute_candidates(rtxn, index, &derived_words)?;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user