// Mirror of https://github.com/meilisearch/meilisearch.git
// (synced 2024-11-30 17:14:59 +08:00; original file: 45 lines, 977 B, Rust).
use unicode_linebreak::{linebreaks, BreakClass, break_property};
|
|
|
|
fn can_be_broken(c: char) -> bool {
|
|
use BreakClass::*;
|
|
|
|
match break_property(c as u32) {
|
|
Ideographic
|
|
| Alphabetic
|
|
| Numeric
|
|
| CombiningMark
|
|
| WordJoiner
|
|
| NonBreakingGlue
|
|
| OpenPunctuation
|
|
| Symbol
|
|
| EmojiBase
|
|
| EmojiModifier
|
|
| HangulLJamo
|
|
| HangulVJamo
|
|
| HangulTJamo
|
|
| RegionalIndicator
|
|
| Quotation => false,
|
|
_ => true,
|
|
}
|
|
}
|
|
|
|
fn extract_token(s: &str) -> &str {
|
|
let end = s.char_indices().rev()
|
|
.take_while(|(_, c)| can_be_broken(*c))
|
|
.last()
|
|
.map(|(i, _)| i)
|
|
.unwrap_or(s.len());
|
|
|
|
&s[..end]
|
|
}
|
|
|
|
pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
|
|
let mut prev = 0;
|
|
linebreaks(&s).map(move |(i, _)| {
|
|
let s = &s[prev..i];
|
|
prev = i;
|
|
extract_token(s)
|
|
})
|
|
.filter(|s| !s.is_empty())
|
|
}
|