Take stop words into account

This commit is contained in:
many 2021-08-17 12:25:07 +02:00
parent 823da19745
commit 2d1727697d
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
3 changed files with 11 additions and 0 deletions

View File

@ -21,6 +21,7 @@ pub fn extract_docid_word_positions<R: io::Read>(
mut obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
searchable_fields: &Option<HashSet<FieldId>>,
stop_words: Option<&fst::Set<&[u8]>>,
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
let max_memory = indexer.max_memory_by_thread();
@ -35,6 +36,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
let mut key_buffer = Vec::new();
let mut field_buffer = String::new();
// Build the analyzer configuration, registering the stop words only when
// the caller supplied a set.
let mut config = AnalyzerConfig::default();
if let Some(stop_words) = stop_words {
    config.stop_words(stop_words);
}
// Hand the configured `config` to the analyzer. Building the analyzer from a
// fresh `AnalyzerConfig::default()` would discard the stop words registered
// above. The element type is left to inference so it follows the type of the
// `stop_words` set.
let analyzer = Analyzer::new(config);
while let Some((key, value)) = obkv_documents.next()? {

View File

@ -37,6 +37,7 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx: Sender<TypedChunk>,
searchable_fields: Option<HashSet<FieldId>>,
faceted_fields: HashSet<FieldId>,
stop_words: Option<fst::Set<&[u8]>>,
) -> Result<()> {
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
.par_bridge()
@ -54,6 +55,7 @@ pub(crate) fn data_from_obkv_documents(
documents_chunk.clone(),
indexer.clone(),
&searchable_fields,
stop_words.as_ref(),
)?;
// send documents_ids to DB writer

View File

@ -231,6 +231,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// get filterable fields for facet databases
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
let stop_words = self.index.stop_words(self.wtxn)?;
// let stop_words = stop_words.as_ref();
// Run extraction pipeline in parallel.
pool.install(|| {
let params = GrenadParameters {
@ -255,6 +258,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
lmdb_writer_sx,
searchable_fields,
faceted_fields,
stop_words,
)
.unwrap();
});