mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 12:05:05 +08:00
Reintroduce a simple HTTP server
This commit is contained in:
parent
2a10b2275e
commit
a26553c90a
1048
Cargo.lock
generated
1048
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -26,5 +26,10 @@ smallvec = "1.4.0"
|
|||||||
structopt = { version = "0.3.14", default-features = false }
|
structopt = { version = "0.3.14", default-features = false }
|
||||||
tempfile = "3.1.0"
|
tempfile = "3.1.0"
|
||||||
|
|
||||||
|
# http server
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
tokio = { version = "0.2.15", features = ["full"] }
|
||||||
|
warp = "0.2.2"
|
||||||
|
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = true
|
debug = true
|
||||||
|
1
public/bulma.min.css
vendored
Normal file
1
public/bulma.min.css
vendored
Normal file
File diff suppressed because one or more lines are too long
199
public/index.html
Normal file
199
public/index.html
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<link rel="stylesheet" href="/bulma.min.css">
|
||||||
|
<script type="text/javascript" src="/jquery-3.4.1.min.js"></script>
|
||||||
|
<script type="text/javascript" src="/papaparse.min.js"></script>
|
||||||
|
<title>The daugt</title>
|
||||||
|
<style>
|
||||||
|
em {
|
||||||
|
color: hsl(204, 86%, 25%);
|
||||||
|
font-style: inherit;
|
||||||
|
background-color: hsl(204, 86%, 88%);
|
||||||
|
}
|
||||||
|
|
||||||
|
#results {
|
||||||
|
max-width: 900px;
|
||||||
|
margin: 20px auto 0 auto;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.notification {
|
||||||
|
display: flex;
|
||||||
|
justify-content: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.level-left {
|
||||||
|
margin-right: 50px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.document {
|
||||||
|
padding: 20px 20px;
|
||||||
|
background-color: #f5f5f5;
|
||||||
|
border-radius: 4px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
display: flex;
|
||||||
|
}
|
||||||
|
|
||||||
|
.document ol {
|
||||||
|
flex: 0 0 75%;
|
||||||
|
max-width: 75%;
|
||||||
|
padding: 0;
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.document .image {
|
||||||
|
max-width: 25%;
|
||||||
|
flex: 0 0 25%;
|
||||||
|
padding-left: 30px;
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
|
||||||
|
.document .image img {
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
.field {
|
||||||
|
list-style-type: none;
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.field:not(:last-child) {
|
||||||
|
margin-bottom: 7px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.attribute {
|
||||||
|
flex: 0 0 35%;
|
||||||
|
max-width: 35%;
|
||||||
|
text-align: right;
|
||||||
|
padding-right: 10px;
|
||||||
|
box-sizing: border-box;
|
||||||
|
text-transform: uppercase;
|
||||||
|
color: rgba(0,0,0,.7);
|
||||||
|
}
|
||||||
|
|
||||||
|
.content {
|
||||||
|
max-width: 65%;
|
||||||
|
flex: 0 0 65%;
|
||||||
|
box-sizing: border-box;
|
||||||
|
padding-left: 10px;
|
||||||
|
color: rgba(0,0,0,.9);
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<section class="hero is-light">
|
||||||
|
<div class="hero-body">
|
||||||
|
<div class="container">
|
||||||
|
<h1 class="title">
|
||||||
|
Welcome to daugt
|
||||||
|
</h1>
|
||||||
|
<h2 class="subtitle">
|
||||||
|
This dashboard will help you check the search results with ease.
|
||||||
|
</h2>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="hero container">
|
||||||
|
<div class="notification" style="border-radius: 0 0 4px 4px;">
|
||||||
|
|
||||||
|
<nav class="level">
|
||||||
|
<!-- Left side -->
|
||||||
|
<div class="level-left">
|
||||||
|
<div class="level-item">
|
||||||
|
<div class="field has-addons has-addons-right">
|
||||||
|
<input id="search" class="input" type="text" autofocus placeholder="e.g. George Clooney">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Right side -->
|
||||||
|
<nav class="level-right">
|
||||||
|
<div class="level-item has-text-centered">
|
||||||
|
<div>
|
||||||
|
<p class="heading">Documents</p>
|
||||||
|
<p id="count" class="title">25</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="level-item has-text-centered">
|
||||||
|
<div>
|
||||||
|
<p class="heading">Time Spent</p>
|
||||||
|
<p id="time" class="title">4ms</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
</nav>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<ol id="results" class="content">
|
||||||
|
<!-- documents matching requests -->
|
||||||
|
</ol>
|
||||||
|
</section>
|
||||||
|
</body>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Handle of the in-flight search request, if any. It is kept so that a newer
// keystroke can cancel an older, now-irrelevant request.
var request = null;

// Run a search on every change of the search box and render the CSV answer.
$('#search').on('input', function () {
  var query = $(this).val();

  request = $.ajax({
    type: "POST",
    url: "query",
    contentType: 'application/json',
    data: JSON.stringify({ 'query': query }),

    // `beforeSend` runs before `$.ajax` returns, so `request` still refers to
    // the previous call here: abort it so that only the latest query renders.
    beforeSend: function () {
      if (request !== null) {
        request.abort();
      }
    },

    success: function (data, textStatus, jqXHR) {
      // The server answers with CSV: the header row followed by one row per document.
      const httpResults = Papa.parse(data, { header: true, skipEmptyLines: true });

      // Look the nodes up explicitly instead of relying on the implicit
      // `window.<id>` globals that browsers create for elements with an id.
      const resultsNode = document.getElementById('results');
      const countNode = document.getElementById('count');
      const timeNode = document.getElementById('time');

      resultsNode.innerHTML = '';

      const timeSpent = jqXHR.getResponseHeader('Time-Ms');
      const numberOfDocuments = httpResults.data.length;
      countNode.textContent = `${numberOfDocuments}`;
      timeNode.textContent = `${timeSpent}ms`;

      for (const element of httpResults.data) {
        const elem = document.createElement('li');
        elem.classList.add("document");

        const ol = document.createElement('ol');

        for (const prop in element) {
          const field = document.createElement('li');
          field.classList.add("field");

          // Field names and values come straight from the indexed documents:
          // insert them as text so that markup stored in a document is
          // displayed literally and never interpreted by the browser.
          const attribute = document.createElement('div');
          attribute.classList.add("attribute");
          attribute.textContent = prop;

          const content = document.createElement('div');
          content.classList.add("content");
          content.textContent = element[prop];

          field.appendChild(attribute);
          field.appendChild(content);

          ol.appendChild(field);
        }

        elem.appendChild(ol);
        resultsNode.appendChild(elem);
      }
    },
  });
});
|
||||||
|
</script>
|
||||||
|
</html>
|
2
public/jquery-3.4.1.min.js
vendored
Normal file
2
public/jquery-3.4.1.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
7
public/papaparse.min.js
vendored
Executable file
7
public/papaparse.min.js
vendored
Executable file
File diff suppressed because one or more lines are too long
@ -2,16 +2,9 @@ use std::io::{self, Write};
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use cow_utils::CowUtils;
|
use heed::EnvOpenOptions;
|
||||||
use fst::{Streamer, IntoStreamer};
|
|
||||||
use heed::types::*;
|
|
||||||
use heed::{EnvOpenOptions, Database};
|
|
||||||
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
use mega_mini_indexer::{Index, BEU32};
|
||||||
use mega_mini_indexer::alphanumeric_tokens;
|
|
||||||
use mega_mini_indexer::BEU32;
|
|
||||||
|
|
||||||
#[derive(Debug, StructOpt)]
|
#[derive(Debug, StructOpt)]
|
||||||
#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
|
#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
|
||||||
@ -35,78 +28,27 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.max_dbs(5)
|
.max_dbs(5)
|
||||||
.open(opt.database)?;
|
.open(opt.database)?;
|
||||||
|
|
||||||
let main = env.create_poly_database(None)?;
|
let index = Index::new(&env)?;
|
||||||
let postings_ids: Database<Str, ByteSlice> = env.create_database(Some("postings-ids"))?;
|
|
||||||
let documents: Database<OwnedType<BEU32>, ByteSlice> = env.create_database(Some("documents"))?;
|
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
let rtxn = env.read_txn()?;
|
let rtxn = env.read_txn()?;
|
||||||
let headers = match main.get::<_, Str, ByteSlice>(&rtxn, "headers")? {
|
|
||||||
|
let documents_ids = index.search(&rtxn, &opt.query)?;
|
||||||
|
let headers = match index.headers(&rtxn)? {
|
||||||
Some(headers) => headers,
|
Some(headers) => headers,
|
||||||
None => return Ok(()),
|
None => return Ok(()),
|
||||||
};
|
};
|
||||||
|
|
||||||
let fst = match main.get::<_, Str, ByteSlice>(&rtxn, "words-fst")? {
|
|
||||||
Some(bytes) => fst::Set::new(bytes)?,
|
|
||||||
None => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
// Building these factories is not free.
|
|
||||||
let lev0 = LevenshteinAutomatonBuilder::new(0, true);
|
|
||||||
let lev1 = LevenshteinAutomatonBuilder::new(1, true);
|
|
||||||
let lev2 = LevenshteinAutomatonBuilder::new(2, true);
|
|
||||||
|
|
||||||
let words: Vec<_> = alphanumeric_tokens(&opt.query).collect();
|
|
||||||
let number_of_words = words.len();
|
|
||||||
let dfas = words.into_iter().enumerate().map(|(i, word)| {
|
|
||||||
let word = word.cow_to_lowercase();
|
|
||||||
let is_last = i + 1 == number_of_words;
|
|
||||||
let dfa = match word.len() {
|
|
||||||
0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
|
|
||||||
5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
|
|
||||||
_ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
|
|
||||||
};
|
|
||||||
(word, dfa)
|
|
||||||
});
|
|
||||||
|
|
||||||
let before = Instant::now();
|
|
||||||
let mut intersect_result: Option<RoaringBitmap> = None;
|
|
||||||
for (word, dfa) in dfas {
|
|
||||||
let before = Instant::now();
|
|
||||||
let mut union_result = RoaringBitmap::default();
|
|
||||||
let mut stream = fst.search(dfa).into_stream();
|
|
||||||
while let Some(word) = stream.next() {
|
|
||||||
let word = std::str::from_utf8(word)?;
|
|
||||||
if let Some(ids) = postings_ids.get(&rtxn, word)? {
|
|
||||||
let right = RoaringBitmap::deserialize_from(ids)?;
|
|
||||||
union_result.union_with(&right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
|
|
||||||
|
|
||||||
intersect_result = match intersect_result.take() {
|
|
||||||
Some(mut left) => {
|
|
||||||
let before = Instant::now();
|
|
||||||
let left_len = left.len();
|
|
||||||
left.intersect_with(&union_result);
|
|
||||||
eprintln!("intersect between {:?} and {:?} took {:.02?}",
|
|
||||||
left_len, union_result.len(), before.elapsed());
|
|
||||||
Some(left)
|
|
||||||
},
|
|
||||||
None => Some(union_result),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut stdout = io::stdout();
|
let mut stdout = io::stdout();
|
||||||
stdout.write_all(&headers)?;
|
stdout.write_all(&headers)?;
|
||||||
|
|
||||||
let total_length = intersect_result.as_ref().map_or(0, |x| x.len());
|
for id in &documents_ids {
|
||||||
for id in intersect_result.unwrap_or_default().iter().take(20) {
|
if let Some(content) = index.documents.get(&rtxn, &BEU32::new(*id))? {
|
||||||
if let Some(content) = documents.get(&rtxn, &BEU32::new(id))? {
|
|
||||||
stdout.write_all(&content)?;
|
stdout.write_all(&content)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eprintln!("Took {:.02?} to find {} documents", before.elapsed(), total_length);
|
eprintln!("Took {:.02?} to find {} documents", before.elapsed(), documents_ids.len());
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
115
src/bin/serve.rs
Normal file
115
src/bin/serve.rs
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
use std::net::SocketAddr;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::str::FromStr;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use structopt::StructOpt;
|
||||||
|
use warp::{Filter, http::Response};
|
||||||
|
|
||||||
|
use mega_mini_indexer::{BEU32, Index};
|
||||||
|
|
||||||
|
#[derive(Debug, StructOpt)]
|
||||||
|
#[structopt(name = "mmi", about = "The server side of the mmi project.")]
|
||||||
|
struct Opt {
|
||||||
|
/// The database path where the LMDB database is located.
|
||||||
|
/// It is created if it doesn't already exist.
|
||||||
|
#[structopt(long = "db", parse(from_os_str))]
|
||||||
|
database: PathBuf,
|
||||||
|
|
||||||
|
/// The maximum size the database can take on disk. It is recommended to specify
|
||||||
|
/// the whole disk space (value must be a multiple of a page size).
|
||||||
|
#[structopt(long = "db-size", default_value = "107374182400")] // 100 GB
|
||||||
|
database_size: usize,
|
||||||
|
|
||||||
|
/// The ip and port on which the database will listen for HTTP requests.
|
||||||
|
#[structopt(short = "l", long, default_value = "127.0.0.1:9700")]
|
||||||
|
http_listen_addr: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
let opt = Opt::from_args();
|
||||||
|
|
||||||
|
std::fs::create_dir_all(&opt.database)?;
|
||||||
|
let env = EnvOpenOptions::new()
|
||||||
|
.map_size(opt.database_size)
|
||||||
|
.max_dbs(10)
|
||||||
|
.open(&opt.database)?;
|
||||||
|
|
||||||
|
let index = Index::new(&env)?;
|
||||||
|
|
||||||
|
// We run and wait on the HTTP server
|
||||||
|
|
||||||
|
// Expose an HTML page to debug the search in a browser
|
||||||
|
let dash_html_route = warp::filters::method::get()
|
||||||
|
.and(warp::filters::path::end())
|
||||||
|
.map(|| warp::reply::html(include_str!("../../public/index.html")));
|
||||||
|
|
||||||
|
let dash_bulma_route = warp::filters::method::get()
|
||||||
|
.and(warp::path!("bulma.min.css"))
|
||||||
|
.map(|| Response::builder()
|
||||||
|
.header("content-type", "text/css; charset=utf-8")
|
||||||
|
.body(include_str!("../../public/bulma.min.css"))
|
||||||
|
);
|
||||||
|
|
||||||
|
let dash_jquery_route = warp::filters::method::get()
|
||||||
|
.and(warp::path!("jquery-3.4.1.min.js"))
|
||||||
|
.map(|| Response::builder()
|
||||||
|
.header("content-type", "application/javascript; charset=utf-8")
|
||||||
|
.body(include_str!("../../public/jquery-3.4.1.min.js"))
|
||||||
|
);
|
||||||
|
|
||||||
|
let dash_papaparse_route = warp::filters::method::get()
|
||||||
|
.and(warp::path!("papaparse.min.js"))
|
||||||
|
.map(|| Response::builder()
|
||||||
|
.header("content-type", "application/javascript; charset=utf-8")
|
||||||
|
.body(include_str!("../../public/papaparse.min.js"))
|
||||||
|
);
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct QueryBody {
|
||||||
|
query: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
let env_cloned = env.clone();
|
||||||
|
let query_route = warp::filters::method::post()
|
||||||
|
.and(warp::path!("query"))
|
||||||
|
.and(warp::body::json())
|
||||||
|
.map(move |query: QueryBody| {
|
||||||
|
let before_search = Instant::now();
|
||||||
|
let rtxn = env_cloned.read_txn().unwrap();
|
||||||
|
|
||||||
|
let documents_ids = index.search(&rtxn, &query.query).unwrap();
|
||||||
|
|
||||||
|
let mut body = Vec::new();
|
||||||
|
if let Some(headers) = index.headers(&rtxn).unwrap() {
|
||||||
|
// We write the headers
|
||||||
|
body.extend_from_slice(headers);
|
||||||
|
|
||||||
|
for id in documents_ids {
|
||||||
|
if let Some(content) = index.documents.get(&rtxn, &BEU32::new(id)).unwrap() {
|
||||||
|
body.extend_from_slice(&content);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Response::builder()
|
||||||
|
.header("Content-Type", "text/csv")
|
||||||
|
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
|
||||||
|
.body(String::from_utf8(body).unwrap())
|
||||||
|
});
|
||||||
|
|
||||||
|
let routes = dash_html_route
|
||||||
|
.or(dash_bulma_route)
|
||||||
|
.or(dash_jquery_route)
|
||||||
|
.or(dash_papaparse_route)
|
||||||
|
.or(query_route);
|
||||||
|
|
||||||
|
let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap();
|
||||||
|
eprintln!("listening on http://{}", addr);
|
||||||
|
warp::serve(routes).run(addr).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
86
src/lib.rs
86
src/lib.rs
@ -1,7 +1,14 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::hash::BuildHasherDefault;
|
use std::hash::BuildHasherDefault;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use cow_utils::CowUtils;
|
||||||
|
use fst::{IntoStreamer, Streamer};
|
||||||
use fxhash::FxHasher32;
|
use fxhash::FxHasher32;
|
||||||
|
use heed::types::*;
|
||||||
|
use heed::{PolyDatabase, Database};
|
||||||
|
use levenshtein_automata::LevenshteinAutomatonBuilder;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
use slice_group_by::StrGroupBy;
|
use slice_group_by::StrGroupBy;
|
||||||
|
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
@ -14,3 +21,82 @@ pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
|
|||||||
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
|
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
|
||||||
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
|
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct Index {
|
||||||
|
pub main: PolyDatabase,
|
||||||
|
pub postings_ids: Database<Str, ByteSlice>,
|
||||||
|
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Index {
|
||||||
|
pub fn new(env: &heed::Env) -> heed::Result<Index> {
|
||||||
|
let main = env.create_poly_database(None)?;
|
||||||
|
let postings_ids = env.create_database(Some("postings-ids"))?;
|
||||||
|
let documents = env.create_database(Some("documents"))?;
|
||||||
|
|
||||||
|
Ok(Index {
|
||||||
|
main,
|
||||||
|
postings_ids,
|
||||||
|
documents,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
|
||||||
|
self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {
|
||||||
|
let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
|
||||||
|
Some(bytes) => fst::Set::new(bytes)?,
|
||||||
|
None => return Ok(Vec::new()),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Building these factories is not free.
|
||||||
|
let lev0 = LevenshteinAutomatonBuilder::new(0, true);
|
||||||
|
let lev1 = LevenshteinAutomatonBuilder::new(1, true);
|
||||||
|
let lev2 = LevenshteinAutomatonBuilder::new(2, true);
|
||||||
|
|
||||||
|
let words: Vec<_> = alphanumeric_tokens(query).collect();
|
||||||
|
let number_of_words = words.len();
|
||||||
|
let dfas = words.into_iter().enumerate().map(|(i, word)| {
|
||||||
|
let word = word.cow_to_lowercase();
|
||||||
|
let is_last = i + 1 == number_of_words;
|
||||||
|
let dfa = match word.len() {
|
||||||
|
0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
|
||||||
|
5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
|
||||||
|
_ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
|
||||||
|
};
|
||||||
|
(word, dfa)
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut intersect_result: Option<RoaringBitmap> = None;
|
||||||
|
for (word, dfa) in dfas {
|
||||||
|
let before = Instant::now();
|
||||||
|
let mut union_result = RoaringBitmap::default();
|
||||||
|
let mut stream = fst.search(dfa).into_stream();
|
||||||
|
while let Some(word) = stream.next() {
|
||||||
|
let word = std::str::from_utf8(word)?;
|
||||||
|
if let Some(ids) = self.postings_ids.get(rtxn, word)? {
|
||||||
|
let right = RoaringBitmap::deserialize_from(ids)?;
|
||||||
|
union_result.union_with(&right);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
|
||||||
|
|
||||||
|
intersect_result = match intersect_result.take() {
|
||||||
|
Some(mut left) => {
|
||||||
|
let before = Instant::now();
|
||||||
|
let left_len = left.len();
|
||||||
|
left.intersect_with(&union_result);
|
||||||
|
eprintln!("intersect between {:?} and {:?} took {:.02?}",
|
||||||
|
left_len, union_result.len(), before.elapsed());
|
||||||
|
Some(left)
|
||||||
|
},
|
||||||
|
None => Some(union_result),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(intersect_result.unwrap_or_default().iter().take(20).collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user