Mirror of https://github.com/meilisearch/meilisearch.git (synced 2024-11-26 03:55:07 +08:00)
Come back to the old tokenizer

Parent: 220ba0785c
Commit: bad0663138

Cargo.lock (generated, 33 changed lines)

@@ -6,15 +6,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
 
-[[package]]
-name = "aho-corasick"
-version = "0.7.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "anyhow"
 version = "1.0.31"

@@ -1029,7 +1020,6 @@ dependencies = [
  "structopt",
  "tempfile",
  "tokio",
- "unicode-linebreak",
  "warp",
 ]
 

@@ -1624,10 +1614,7 @@ version = "1.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
 dependencies = [
- "aho-corasick",
- "memchr",
  "regex-syntax",
- "thread_local 1.0.1",
 ]
 
 [[package]]

@@ -1851,7 +1838,7 @@ dependencies = [
  "chrono",
  "log 0.4.8",
  "termcolor",
- "thread_local 0.3.4",
+ "thread_local",
 ]
 
 [[package]]

@@ -1966,15 +1953,6 @@ dependencies = [
  "unreachable",
 ]
 
-[[package]]
-name = "thread_local"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
-dependencies = [
- "lazy_static 1.4.0",
-]
-
 [[package]]
 name = "time"
 version = "0.1.43"

@@ -2139,15 +2117,6 @@ dependencies = [
  "matches",
 ]
 
-[[package]]
-name = "unicode-linebreak"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150"
-dependencies = [
- "regex",
-]
-
 [[package]]
 name = "unicode-normalization"
 version = "0.1.12"

Cargo.toml

@@ -31,7 +31,6 @@ smallstr = "0.2.0"
 smallvec = "1.4.0"
 structopt = { version = "0.3.14", default-features = false }
 tempfile = "3.1.0"
-unicode-linebreak = "0.1.0"
 
 # logging
 log = "0.4.8"

@@ -21,7 +21,8 @@ use rayon::prelude::*;
 use roaring::RoaringBitmap;
 use structopt::StructOpt;
 
-use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::tokenizer::{simple_tokenizer, only_words};
 
 const LMDB_MAX_KEY_LENGTH: usize = 511;
 const ONE_MILLION: usize = 1_000_000;

@@ -367,7 +368,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
         WORDS_FST_KEY => {
             let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
 
-            // Union of the two FSTs
+            // Union of the FSTs
             let mut op = fst::set::OpBuilder::new();
             fsts.iter().for_each(|fst| op.push(fst.into_stream()));
             let op = op.r#union();
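
Not part of the commit: a standalone sketch of the same union-of-FSTs pattern, assuming the fst crate's Set, OpBuilder and SetBuilder APIs used in the hunk above; union_fsts and the sample word lists are illustrative only.

use fst::{IntoStreamer, Set};

// Union several word FSTs into a single set, mirroring the WORDS_FST_KEY merge arm.
fn union_fsts(sets: &[Set<Vec<u8>>]) -> fst::Result<Set<Vec<u8>>> {
    let mut op = fst::set::OpBuilder::new();
    sets.iter().for_each(|set| op.push(set.into_stream()));
    let union = op.union();

    // Collect the union stream back into an in-memory FST.
    let mut builder = fst::SetBuilder::memory();
    builder.extend_stream(union)?;
    Ok(builder.into_set())
}

fn main() -> fst::Result<()> {
    let a = Set::from_iter(vec!["hello", "world"])?;
    let b = Set::from_iter(vec!["hello", "tokenizer"])?;
    let merged = union_fsts(&[a, b])?;
    assert_eq!(merged.len(), 3); // "hello", "tokenizer", "world"
    Ok(())
}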

@@ -387,15 +388,16 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
         | WORD_FOUR_POSITIONS_DOCIDS_BYTE
         | WORD_ATTRIBUTE_DOCIDS_BYTE =>
         {
-            let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap();
+            let (head, tail) = values.split_first().unwrap();
 
-            for value in &values[1..] {
+            let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
+            for value in tail {
                 let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
-                first.union_with(&bitmap);
+                head.union_with(&bitmap);
             }
 
-            let mut vec = Vec::new();
-            first.serialize_into(&mut vec).unwrap();
+            let mut vec = Vec::with_capacity(head.serialized_size());
+            head.serialize_into(&mut vec).unwrap();
             Ok(vec)
         },
         otherwise => panic!("wut {:?}", otherwise),
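
The docids merge above can also be tried in isolation. A minimal sketch assuming the roaring crate; union_serialized and the sample bitmaps are illustrative, not part of the diff.

use roaring::RoaringBitmap;

// Union several serialized bitmaps into one serialized bitmap,
// following the same head/tail pattern as the merge arm above.
fn union_serialized(values: &[Vec<u8>]) -> Vec<u8> {
    let (head, tail) = values.split_first().unwrap();
    let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
    for value in tail {
        let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
        head.union_with(&bitmap);
    }
    // Pre-sizing the buffer avoids reallocations during serialization.
    let mut vec = Vec::with_capacity(head.serialized_size());
    head.serialize_into(&mut vec).unwrap();
    vec
}

fn main() {
    let serialize = |bitmap: &RoaringBitmap| {
        let mut buf = Vec::with_capacity(bitmap.serialized_size());
        bitmap.serialize_into(&mut buf).unwrap();
        buf
    };
    let a: RoaringBitmap = (0..5u32).collect();
    let b: RoaringBitmap = (3..10u32).collect();
    let merged = union_serialized(&[serialize(&a), serialize(&b)]);
    let merged = RoaringBitmap::deserialize_from(merged.as_slice()).unwrap();
    let expected: RoaringBitmap = (0..10u32).collect();
    assert_eq!(merged, expected);
}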

@@ -505,8 +507,8 @@ fn index_csv(
 
     let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
     for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
-        for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
-            let word = word.cow_to_lowercase();
+        for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
+            let word = token.cow_to_lowercase();
             let position = (attr * MAX_POSITION + pos) as u32;
             store.insert_word_position_docid(&word, position, document_id)?;
         }
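
One detail worth spelling out from the hunk above is the word position: the attribute index and the token position are packed into a single u32. A tiny worked example with a hypothetical constant value (the real MAX_POSITION lives in this binary and may differ):

// Hypothetical value, for illustration only.
const MAX_POSITION: usize = 1000;

fn main() {
    // Attribute 2, token position 7 inside that attribute:
    let (attr, pos) = (2usize, 7usize);
    let position = (attr * MAX_POSITION + pos) as u32; // 2 * 1000 + 7
    assert_eq!(position, 2007);

    // Both parts can be recovered by division and remainder.
    assert_eq!(position as usize / MAX_POSITION, attr);
    assert_eq!(position as usize % MAX_POSITION, pos);
}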

@@ -9,10 +9,10 @@ use std::time::Instant;
 use askama_warp::Template;
 use heed::EnvOpenOptions;
 use serde::Deserialize;
-use slice_group_by::StrGroupBy;
 use structopt::StructOpt;
 use warp::{Filter, http::Response};
 
+use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::{Index, SearchResult};
 
 #[cfg(target_os = "linux")]

@@ -47,12 +47,16 @@ struct Opt {
 
 fn highlight_string(string: &str, words: &HashSet<String>) -> String {
     let mut output = String::new();
-    for token in string.linear_group_by_key(|c| c.is_alphanumeric()) {
-        let lowercase_token = token.to_lowercase();
-        let to_highlight = words.contains(&lowercase_token);
-        if to_highlight { output.push_str("<mark>") }
-        output.push_str(token);
-        if to_highlight { output.push_str("</mark>") }
+    for (token_type, token) in simple_tokenizer(string) {
+        if token_type == TokenType::Word {
+            let lowercase_token = token.to_lowercase();
+            let to_highlight = words.contains(&lowercase_token);
+            if to_highlight { output.push_str("<mark>") }
+            output.push_str(token);
+            if to_highlight { output.push_str("</mark>") }
+        } else {
+            output.push_str(token);
+        }
     }
     output
 }
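
A self-contained sketch of the new highlighting behaviour, reusing the tokenizer introduced by this commit; the highlight helper below simply mirrors highlight_string and is not part of the diff.

use std::collections::HashSet;

use milli::tokenizer::{simple_tokenizer, TokenType};

// Mirrors highlight_string above: only Word tokens are candidates for <mark> tags,
// separators are copied through untouched.
fn highlight(string: &str, words: &HashSet<String>) -> String {
    let mut output = String::new();
    for (token_type, token) in simple_tokenizer(string) {
        if token_type == TokenType::Word {
            let to_highlight = words.contains(&token.to_lowercase());
            if to_highlight { output.push_str("<mark>") }
            output.push_str(token);
            if to_highlight { output.push_str("</mark>") }
        } else {
            output.push_str(token);
        }
    }
    output
}

fn main() {
    let words: HashSet<String> = ["world".to_string()].iter().cloned().collect();
    // Matching is done on the lowercased token, so "World" is still highlighted.
    assert_eq!(highlight("Hello, World!", &words), "Hello, <mark>World</mark>!");
}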

src/lexer.rs (deleted, 44 lines)

@@ -1,44 +0,0 @@
-use unicode_linebreak::{linebreaks, BreakClass, break_property};
-
-fn can_be_broken(c: char) -> bool {
-    use BreakClass::*;
-
-    match break_property(c as u32) {
-        Ideographic
-        | Alphabetic
-        | Numeric
-        | CombiningMark
-        | WordJoiner
-        | NonBreakingGlue
-        | OpenPunctuation
-        | Symbol
-        | EmojiBase
-        | EmojiModifier
-        | HangulLJamo
-        | HangulVJamo
-        | HangulTJamo
-        | RegionalIndicator
-        | Quotation => false,
-        _ => true,
-    }
-}
-
-fn extract_token(s: &str) -> &str {
-    let end = s.char_indices().rev()
-        .take_while(|(_, c)| can_be_broken(*c))
-        .last()
-        .map(|(i, _)| i)
-        .unwrap_or(s.len());
-
-    &s[..end]
-}
-
-pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
-    let mut prev = 0;
-    linebreaks(&s).map(move |(i, _)| {
-        let s = &s[prev..i];
-        prev = i;
-        extract_token(s)
-    })
-    .filter(|s| !s.is_empty())
-}

@@ -3,7 +3,7 @@ mod node;
 mod query_tokens;
 mod search;
 pub mod heed_codec;
-pub mod lexer;
+pub mod tokenizer;
 
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;

@@ -1,5 +1,4 @@
 use std::{mem, str};
-use unicode_linebreak::{break_property, BreakClass};
 
 use QueryToken::{Quoted, Free};
 

@@ -69,12 +68,6 @@ impl<'a> Iterator for QueryTokens<'a> {
                 },
                 State::Fused => return None,
             }
-        } else if break_property(c as u32) == BreakClass::Ideographic {
-            match self.state.replace_by(State::Free(afteri)) {
-                State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])),
-                State::Free(s) => return Some(Free(&self.string[s..afteri])),
-                _ => self.state = State::Free(afteri),
-            }
         } else if !self.state.is_quoted() && !c.is_alphanumeric() {
             match self.state.replace_by(State::Free(afteri)) {
                 State::Free(s) if i > s => return Some(Free(&self.string[s..i])),

src/tokenizer.rs (new file, 21 lines)

@@ -0,0 +1,21 @@
+use slice_group_by::StrGroupBy;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenType {
+    Word,
+    Space,
+}
+
+pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
+    text
+        .linear_group_by_key(|c| c.is_alphanumeric())
+        .map(|s| {
+            let first = s.chars().next().unwrap();
+            let type_ = if first.is_alphanumeric() { TokenType::Word } else { TokenType::Space };
+            (type_, s)
+        })
+}
+
+pub fn only_words((t, _): &(TokenType, &str)) -> bool {
+    *t == TokenType::Word
+}
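
A quick usage sketch of the restored tokenizer; the sample text is illustrative and the snippet assumes milli as a dependency.

use milli::tokenizer::{simple_tokenizer, only_words, TokenType};

fn main() {
    // Groups alternate between alphanumeric and non-alphanumeric runs:
    // (Word, "Hello"), (Space, ", "), (Word, "world"), (Space, "!")
    for (token_type, token) in simple_tokenizer("Hello, world!") {
        println!("{:?}: {:?}", token_type, token);
    }

    // only_words plugs straight into Iterator::filter, as the indexer does above.
    let words: Vec<&str> = simple_tokenizer("Hello, world!")
        .filter(only_words)
        .map(|(_, token)| token)
        .collect();
    assert_eq!(words, vec!["Hello", "world"]);

    // Non-word groups are reported as Space tokens, punctuation included.
    assert!(simple_tokenizer("!?").all(|(t, _)| t == TokenType::Space));
}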