feat: Add distinction on start of title

This commit is contained in:
Clément Renault 2018-10-18 17:08:51 +02:00
parent 86cb01b243
commit e1936ae3f7

View File

@ -6,7 +6,7 @@ use std::path::PathBuf;
use elapsed::measure_time; use elapsed::measure_time;
use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
use raptor::rank::{criterion, Config, RankedStream, Document}; use raptor::rank::{criterion, Config, RankedStream, Document};
use raptor::{automaton, Metadata, CommonWords}; use raptor::{automaton, DocumentId, Metadata, CommonWords};
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct CommandConsole { pub struct CommandConsole {
@ -62,13 +62,6 @@ impl ConsoleSearch {
} }
} }
// "Sony" "PlayStation 4 500GB"
fn starts_with_playstation(doc: &Document, database: &DB) -> Vec<u8> {
let title_key = format!("{}-title", doc.id);
let title = database.get(title_key.as_bytes()).unwrap().unwrap();
title.get(0..4).map(|s| s.to_vec()).unwrap_or(Vec::new())
}
fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) { fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) {
let mut automatons = Vec::new(); let mut automatons = Vec::new();
for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) { for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) {
@ -76,16 +69,31 @@ fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query:
automatons.push(lev); automatons.push(lev);
} }
let distinct_by_title_first_four_chars = |id: &DocumentId| {
let title_key = format!("{}-title", id);
match database.get(title_key.as_bytes()) {
Ok(Some(value)) => {
value.to_utf8().map(|s| s.chars().take(4).collect::<String>())
},
Ok(None) => None,
Err(err) => {
eprintln!("{:?}", err);
None
}
}
};
// "Sony" "PlayStation 4 500GB"
let config = Config { let config = Config {
metadata: metadata, metadata: metadata,
automatons: automatons, automatons: automatons,
criteria: criterion::default(), criteria: criterion::default(),
distinct: ((), 1), distinct: (distinct_by_title_first_four_chars, 1),
}; };
let stream = RankedStream::new(config); let stream = RankedStream::new(config);
// let documents = stream.retrieve_distinct_documents(|doc| starts_with_playstation(doc, database), 0..20); let documents = stream.retrieve_distinct_documents(0..20);
let documents = stream.retrieve_documents(0..20); // let documents = stream.retrieve_documents(0..20);
for document in documents { for document in documents {
let id_key = format!("{}-id", document.id); let id_key = format!("{}-id", document.id);