579: Stop reindexing already indexed documents r=ManyTheFish a=irevoire

```
 % ./compare.sh indexing_stop-reindexing-unchanged-documents_cb5a1669.json indexing_main_eeba1960.json
group                                                                     indexing_main_eeba1960                 indexing_stop-reindexing-unchanged-documents_cb5a1669
-----                                                                     ----------------------                 -----------------------------------------------------
indexing/-geo-delete-facetedNumber-facetedGeo-searchable-                 1.03      2.0±0.22ms        ? ?/sec    1.00  1955.4±336.24µs        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-           1.08     11.0±2.93ms        ? ?/sec    1.00     10.2±4.04ms        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-nested-    1.00     15.1±3.89ms        ? ?/sec    1.14     17.1±5.18ms        ? ?/sec
indexing/-songs-delete-facetedString-facetedNumber-searchable-            1.26    59.2±12.01ms        ? ?/sec    1.00     47.1±8.52ms        ? ?/sec
indexing/-wiki-delete-searchable-                                         1.08   316.6±31.53ms        ? ?/sec    1.00   293.6±17.00ms        ? ?/sec
indexing/Indexing geo_point                                               1.01      60.9±0.31s        ? ?/sec    1.00      60.6±0.36s        ? ?/sec
indexing/Indexing movies in three batches                                 1.04      20.0±0.30s        ? ?/sec    1.00      19.2±0.25s        ? ?/sec
indexing/Indexing movies with default settings                            1.02      19.1±0.18s        ? ?/sec    1.00      18.7±0.24s        ? ?/sec
indexing/Indexing nested movies with default settings                     1.02      26.2±0.29s        ? ?/sec    1.00      25.9±0.22s        ? ?/sec
indexing/Indexing nested movies without any facets                        1.02      25.3±0.32s        ? ?/sec    1.00      24.7±0.26s        ? ?/sec
indexing/Indexing songs in three batches with default settings            1.00      66.7±0.41s        ? ?/sec    1.01      67.1±0.86s        ? ?/sec
indexing/Indexing songs with default settings                             1.00      58.3±0.90s        ? ?/sec    1.01      58.8±1.32s        ? ?/sec
indexing/Indexing songs without any facets                                1.00      54.5±1.43s        ? ?/sec    1.01      55.2±1.29s        ? ?/sec
indexing/Indexing songs without faceted numbers                           1.00      57.9±1.20s        ? ?/sec    1.01      58.4±0.93s        ? ?/sec
indexing/Indexing wiki                                                    1.00   1052.0±10.95s        ? ?/sec    1.02   1069.4±20.38s        ? ?/sec
indexing/Indexing wiki in three batches                                   1.00    1193.1±8.83s        ? ?/sec    1.00    1189.5±9.40s        ? ?/sec
indexing/Reindexing geo_point                                             3.22      67.5±0.73s        ? ?/sec    1.00      21.0±0.16s        ? ?/sec
indexing/Reindexing movies with default settings                          3.75      19.4±0.28s        ? ?/sec    1.00       5.2±0.05s        ? ?/sec
indexing/Reindexing songs with default settings                           8.90      61.4±0.91s        ? ?/sec    1.00       6.9±0.07s        ? ?/sec
indexing/Reindexing wiki                                                  1.00   1748.2±35.68s        ? ?/sec    1.00   1750.5±18.53s        ? ?/sec
```

tldr: We do not lose any performance on the normal indexing benchmark, but we get between 3 and 8 times faster on the reindexing benchmarks 👍 

Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2022-08-04 08:10:37 +00:00 committed by GitHub
commit 50f6524ff2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -200,7 +200,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut original_docid = None; let mut original_docid = None;
let docid = match self.new_external_documents_ids_builder.entry(external_id.into()) { let docid =
match self.new_external_documents_ids_builder.entry(external_id.clone().into()) {
Entry::Occupied(entry) => *entry.get() as u32, Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => { Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a replaced document. // If the document was already in the db we mark it as a replaced document.
@ -218,6 +219,7 @@ impl<'a, 'i> Transform<'a, 'i> {
} }
}; };
let mut skip_insertion = false;
if let Some(original_docid) = original_docid { if let Some(original_docid) = original_docid {
let original_key = BEU32::new(original_docid); let original_key = BEU32::new(original_docid);
let base_obkv = self let base_obkv = self
@ -230,24 +232,39 @@ impl<'a, 'i> Transform<'a, 'i> {
key: None, key: None,
})?; })?;
// we check if the two documents are exactly equal. If it's the case we can skip this document entirely
if base_obkv == obkv_buffer {
// we're not replacing anything
self.replaced_documents_ids.remove(original_docid);
// and we need to put back the original id as it was before
self.new_external_documents_ids_builder.remove(&*external_id);
skip_insertion = true;
} else {
// we associate the base document with the new key, everything will get merged later. // we associate the base document with the new key, everything will get merged later.
self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, Some(buffer) => {
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?
}
None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
} }
}
} else { } else {
self.new_documents_ids.insert(docid); self.new_documents_ids.insert(docid);
} }
if !skip_insertion {
// We use the extracted/generated user id as the key for this document. // We use the extracted/generated user id as the key for this document.
self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?;
documents_count += 1;
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, None => {
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
} }
}
}
documents_count += 1;
progress_callback(UpdateIndexingStep::RemapDocumentAddition { progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count, documents_seen: documents_count,