Fix fst builder when their is no previous FST

This commit is contained in:
ManyTheFish 2024-09-25 16:52:43 +02:00
parent 3d244451df
commit 960060ebdf

View File

@ -16,6 +16,9 @@ pub struct WordFstBuilder<'a> {
current_prefix: Vec<SmallString32>, current_prefix: Vec<SmallString32>,
current_prefix_count: Vec<u64>, current_prefix_count: Vec<u64>,
prefix_count_threshold: u64, prefix_count_threshold: u64,
inserted_words: usize,
registered_words: usize,
base_set_length: usize,
} }
impl<'a> WordFstBuilder<'a> { impl<'a> WordFstBuilder<'a> {
@ -37,10 +40,17 @@ impl<'a> WordFstBuilder<'a> {
current_prefix: vec![SmallString32::new(); max_prefix_length], current_prefix: vec![SmallString32::new(); max_prefix_length],
current_prefix_count: vec![0; max_prefix_length], current_prefix_count: vec![0; max_prefix_length],
prefix_count_threshold: 100, prefix_count_threshold: 100,
inserted_words: 0,
registered_words: 0,
base_set_length: words_fst.len(),
}) })
} }
pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> { pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> {
if deladd == DelAdd::Addition {
self.registered_words += 1;
}
if let Some(left) = self.last_word.take() { if let Some(left) = self.last_word.take() {
let (left_inserted, right_inserted) = let (left_inserted, right_inserted) =
self.compare_and_insert(deladd, left.as_slice(), right)?; self.compare_and_insert(deladd, left.as_slice(), right)?;
@ -68,10 +78,15 @@ impl<'a> WordFstBuilder<'a> {
// right was inserted, so we can stop // right was inserted, so we can stop
if right_inserted { if right_inserted {
break; self.stream = Some(stream);
return Ok(());
} }
} }
// If we reach this point, it means that the stream is empty
// and we need to insert the incoming word
self.insert_word(right)?;
self.stream = Some(stream); self.stream = Some(stream);
} }
@ -118,6 +133,7 @@ impl<'a> WordFstBuilder<'a> {
} }
fn insert_word(&mut self, bytes: &[u8]) -> Result<()> { fn insert_word(&mut self, bytes: &[u8]) -> Result<()> {
self.inserted_words += 1;
self.word_fst_builder.insert(bytes)?; self.word_fst_builder.insert(bytes)?;
for n in 0..self.max_prefix_length { for n in 0..self.max_prefix_length {
@ -182,6 +198,13 @@ impl<'a> WordFstBuilder<'a> {
let prefix_fst_file = builder.into_inner()?.into_inner().unwrap(); let prefix_fst_file = builder.into_inner()?.into_inner().unwrap();
let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? }; let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? };
eprintln!("================================================");
eprintln!(
"inserted words: {}, registered words: {}, base set len: {}",
self.inserted_words, self.registered_words, self.base_set_length
);
eprintln!("================================================");
Ok((words_fst_mmap, prefix_fst_mmap)) Ok((words_fst_mmap, prefix_fst_mmap))
} }
} }