Reintroduce a simple HTTP server

This commit is contained in:
Kerollmops 2020-05-31 17:48:13 +02:00
parent 2a10b2275e
commit a26553c90a
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
9 changed files with 1458 additions and 83 deletions

1048
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -26,5 +26,10 @@ smallvec = "1.4.0"
structopt = { version = "0.3.14", default-features = false } structopt = { version = "0.3.14", default-features = false }
tempfile = "3.1.0" tempfile = "3.1.0"
# http server
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "0.2.15", features = ["full"] }
warp = "0.2.2"
[profile.release] [profile.release]
debug = true debug = true

1
public/bulma.min.css vendored Normal file

File diff suppressed because one or more lines are too long

199
public/index.html Normal file
View File

@ -0,0 +1,199 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- All assets below are vendored and served by the same HTTP server
     (routes /bulma.min.css, /jquery-3.4.1.min.js, /papaparse.min.js). -->
<link rel="stylesheet" href="/bulma.min.css">
<script type="text/javascript" src="/jquery-3.4.1.min.js"></script>
<script type="text/javascript" src="/papaparse.min.js"></script>
<title>The daugt</title>
<style>
/* <em> styling — presumably used to highlight query matches in documents;
   no emitter is visible in this file, confirm against the server output. */
em {
color: hsl(204, 86%, 25%);
font-style: inherit;
background-color: hsl(204, 86%, 88%);
}
/* Container for the result list filled by the inline script below. */
#results {
max-width: 900px;
margin: 20px auto 0 auto;
padding: 0;
}
.notification {
display: flex;
justify-content: center;
}
.level-left {
margin-right: 50px;
}
/* One matched document: a 75% field list next to a 25% image column. */
.document {
padding: 20px 20px;
background-color: #f5f5f5;
border-radius: 4px;
margin-bottom: 20px;
display: flex;
}
.document ol {
flex: 0 0 75%;
max-width: 75%;
padding: 0;
margin: 0;
}
.document .image {
max-width: 25%;
flex: 0 0 25%;
padding-left: 30px;
box-sizing: border-box;
}
.document .image img {
width: 100%;
}
/* One attribute/value row inside a document. */
.field {
list-style-type: none;
display: flex;
flex-wrap: wrap;
}
.field:not(:last-child) {
margin-bottom: 7px;
}
.attribute {
flex: 0 0 35%;
max-width: 35%;
text-align: right;
padding-right: 10px;
box-sizing: border-box;
text-transform: uppercase;
color: rgba(0,0,0,.7);
}
.content {
max-width: 65%;
flex: 0 0 65%;
box-sizing: border-box;
padding-left: 10px;
color: rgba(0,0,0,.9);
}
</style>
</head>
<body>
<section class="hero is-light">
<div class="hero-body">
<div class="container">
<h1 class="title">
Welcome to daugt
</h1>
<h2 class="subtitle">
This dashboard will help you check the search results with ease.
</h2>
</div>
</div>
</section>
<!-- Search bar plus live counters; #count and #time are updated
     by the inline script from each response. -->
<section class="hero container">
<div class="notification" style="border-radius: 0 0 4px 4px;">
<nav class="level">
<!-- Left side -->
<div class="level-left">
<div class="level-item">
<div class="field has-addons has-addons-right">
<input id="search" class="input" type="text" autofocus placeholder="e.g. George Clooney">
</div>
</div>
</div>
<!-- Right side -->
<nav class="level-right">
<div class="level-item has-text-centered">
<div>
<p class="heading">Documents</p>
<p id="count" class="title">25</p>
</div>
</div>
<div class="level-item has-text-centered">
<div>
<p class="heading">Time Spent</p>
<p id="time" class="title">4ms</p>
</div>
</div>
</nav>
</nav>
</div>
</section>
<section>
<ol id="results" class="content">
<!-- documents matching requests -->
</ol>
</section>
</body>
<script>
var request = null;
$('#search').on('input', function () {
var query = $(this).val();
request = $.ajax({
type: "POST",
url: "query",
contentType: 'application/json',
data: JSON.stringify({ 'query': query }),
contentType: 'application/json',
success: function (data, textStatus, request) {
let httpResults = Papa.parse(data, { header: true, skipEmptyLines: true });
results.innerHTML = '';
let timeSpent = request.getResponseHeader('Time-Ms');
let numberOfDocuments = httpResults.data.length;
count.innerHTML = `${numberOfDocuments}`;
time.innerHTML = `${timeSpent}ms`;
for (element of httpResults.data) {
const elem = document.createElement('li');
elem.classList.add("document");
const ol = document.createElement('ol');
for (const prop in element) {
const field = document.createElement('li');
field.classList.add("field");
const attribute = document.createElement('div');
attribute.classList.add("attribute");
attribute.innerHTML = prop;
const content = document.createElement('div');
content.classList.add("content");
content.innerHTML = element[prop];
field.appendChild(attribute);
field.appendChild(content);
ol.appendChild(field);
}
elem.appendChild(ol);
results.appendChild(elem)
}
},
beforeSend: function () {
if (request !== null) {
request.abort();
}
},
});
});
</script>
</html>

2
public/jquery-3.4.1.min.js vendored Normal file

File diff suppressed because one or more lines are too long

7
public/papaparse.min.js vendored Executable file

File diff suppressed because one or more lines are too long

View File

@ -2,16 +2,9 @@ use std::io::{self, Write};
use std::path::PathBuf; use std::path::PathBuf;
use std::time::Instant; use std::time::Instant;
use cow_utils::CowUtils; use heed::EnvOpenOptions;
use fst::{Streamer, IntoStreamer};
use heed::types::*;
use heed::{EnvOpenOptions, Database};
use levenshtein_automata::LevenshteinAutomatonBuilder;
use roaring::RoaringBitmap;
use structopt::StructOpt; use structopt::StructOpt;
use mega_mini_indexer::{Index, BEU32};
use mega_mini_indexer::alphanumeric_tokens;
use mega_mini_indexer::BEU32;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")] #[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
@ -35,78 +28,27 @@ fn main() -> anyhow::Result<()> {
.max_dbs(5) .max_dbs(5)
.open(opt.database)?; .open(opt.database)?;
let main = env.create_poly_database(None)?; let index = Index::new(&env)?;
let postings_ids: Database<Str, ByteSlice> = env.create_database(Some("postings-ids"))?;
let documents: Database<OwnedType<BEU32>, ByteSlice> = env.create_database(Some("documents"))?;
let before = Instant::now();
let rtxn = env.read_txn()?; let rtxn = env.read_txn()?;
let headers = match main.get::<_, Str, ByteSlice>(&rtxn, "headers")? {
let documents_ids = index.search(&rtxn, &opt.query)?;
let headers = match index.headers(&rtxn)? {
Some(headers) => headers, Some(headers) => headers,
None => return Ok(()), None => return Ok(()),
}; };
let fst = match main.get::<_, Str, ByteSlice>(&rtxn, "words-fst")? {
Some(bytes) => fst::Set::new(bytes)?,
None => return Ok(()),
};
// Building these factories is not free.
let lev0 = LevenshteinAutomatonBuilder::new(0, true);
let lev1 = LevenshteinAutomatonBuilder::new(1, true);
let lev2 = LevenshteinAutomatonBuilder::new(2, true);
let words: Vec<_> = alphanumeric_tokens(&opt.query).collect();
let number_of_words = words.len();
let dfas = words.into_iter().enumerate().map(|(i, word)| {
let word = word.cow_to_lowercase();
let is_last = i + 1 == number_of_words;
let dfa = match word.len() {
0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
_ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
};
(word, dfa)
});
let before = Instant::now();
let mut intersect_result: Option<RoaringBitmap> = None;
for (word, dfa) in dfas {
let before = Instant::now();
let mut union_result = RoaringBitmap::default();
let mut stream = fst.search(dfa).into_stream();
while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?;
if let Some(ids) = postings_ids.get(&rtxn, word)? {
let right = RoaringBitmap::deserialize_from(ids)?;
union_result.union_with(&right);
}
}
eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
intersect_result = match intersect_result.take() {
Some(mut left) => {
let before = Instant::now();
let left_len = left.len();
left.intersect_with(&union_result);
eprintln!("intersect between {:?} and {:?} took {:.02?}",
left_len, union_result.len(), before.elapsed());
Some(left)
},
None => Some(union_result),
};
}
let mut stdout = io::stdout(); let mut stdout = io::stdout();
stdout.write_all(&headers)?; stdout.write_all(&headers)?;
let total_length = intersect_result.as_ref().map_or(0, |x| x.len()); for id in &documents_ids {
for id in intersect_result.unwrap_or_default().iter().take(20) { if let Some(content) = index.documents.get(&rtxn, &BEU32::new(*id))? {
if let Some(content) = documents.get(&rtxn, &BEU32::new(id))? {
stdout.write_all(&content)?; stdout.write_all(&content)?;
} }
} }
eprintln!("Took {:.02?} to find {} documents", before.elapsed(), total_length); eprintln!("Took {:.02?} to find {} documents", before.elapsed(), documents_ids.len());
Ok(()) Ok(())
} }

115
src/bin/serve.rs Normal file
View File

@ -0,0 +1,115 @@
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Instant;
use heed::EnvOpenOptions;
use serde::Deserialize;
use structopt::StructOpt;
use warp::{Filter, http::Response};
use mega_mini_indexer::{BEU32, Index};
/// Command-line options of the standalone HTTP search server.
#[derive(Debug, StructOpt)]
#[structopt(name = "mmi", about = "The server side of the mmi project.")]
struct Opt {
/// The database path where the LMDB database is located.
/// It is created if it doesn't already exist.
#[structopt(long = "db", parse(from_os_str))]
database: PathBuf,
/// The maximum size the database can take on disk. It is recommended to specify
/// the whole disk space (value must be a multiple of a page size).
#[structopt(long = "db-size", default_value = "107374182400")] // 100 GB
database_size: usize,
/// The ip and port on which the database will listen for HTTP requests.
#[structopt(short = "l", long, default_value = "127.0.0.1:9700")]
http_listen_addr: String,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let opt = Opt::from_args();

    // Open (creating it first if needed) the LMDB environment holding the index.
    std::fs::create_dir_all(&opt.database)?;
    let env = EnvOpenOptions::new()
        .map_size(opt.database_size)
        .max_dbs(10)
        .open(&opt.database)?;

    let index = Index::new(&env)?;

    // We run and wait on the HTTP server.

    // Expose an HTML page to debug the search in a browser.
    let dash_html_route = warp::filters::method::get()
        .and(warp::filters::path::end())
        .map(|| warp::reply::html(include_str!("../../public/index.html")));

    // Static assets are embedded in the binary at compile time via include_str!.
    let dash_bulma_route = warp::filters::method::get()
        .and(warp::path!("bulma.min.css"))
        .map(|| Response::builder()
            .header("content-type", "text/css; charset=utf-8")
            .body(include_str!("../../public/bulma.min.css"))
        );

    let dash_jquery_route = warp::filters::method::get()
        .and(warp::path!("jquery-3.4.1.min.js"))
        .map(|| Response::builder()
            .header("content-type", "application/javascript; charset=utf-8")
            .body(include_str!("../../public/jquery-3.4.1.min.js"))
        );

    let dash_papaparse_route = warp::filters::method::get()
        .and(warp::path!("papaparse.min.js"))
        .map(|| Response::builder()
            .header("content-type", "application/javascript; charset=utf-8")
            .body(include_str!("../../public/papaparse.min.js"))
        );

    /// The JSON body expected by the POST /query route.
    #[derive(Deserialize)]
    struct QueryBody {
        query: String,
    }

    let env_cloned = env.clone();
    let query_route = warp::filters::method::post()
        .and(warp::path!("query"))
        .and(warp::body::json())
        .map(move |query: QueryBody| {
            let before_search = Instant::now();
            let rtxn = env_cloned.read_txn().unwrap();
            let documents_ids = index.search(&rtxn, &query.query).unwrap();

            // The response body is CSV: the stored headers line followed by the
            // raw content of every matching document.
            // NOTE(review): the unwraps in this handler turn storage errors into
            // a panic of the serving task; consider mapping them to a 500 reply.
            let mut body = Vec::new();
            if let Some(headers) = index.headers(&rtxn).unwrap() {
                // We write the headers
                body.extend_from_slice(headers);
                for id in documents_ids {
                    if let Some(content) = index.documents.get(&rtxn, &BEU32::new(id)).unwrap() {
                        body.extend_from_slice(&content);
                    }
                }
            }

            Response::builder()
                .header("Content-Type", "text/csv")
                .header("Time-Ms", before_search.elapsed().as_millis().to_string())
                .body(String::from_utf8(body).unwrap())
        });

    let routes = dash_html_route
        .or(dash_bulma_route)
        .or(dash_jquery_route)
        .or(dash_papaparse_route)
        .or(query_route);

    // Fix: parse the listen address with `?` instead of `.unwrap()` so an
    // invalid --http-listen-addr is reported as an error, not a panic.
    let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
    eprintln!("listening on http://{}", addr);
    warp::serve(routes).run(addr).await;

    Ok(())
}

View File

@ -1,7 +1,14 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::hash::BuildHasherDefault; use std::hash::BuildHasherDefault;
use std::time::Instant;
use cow_utils::CowUtils;
use fst::{IntoStreamer, Streamer};
use fxhash::FxHasher32; use fxhash::FxHasher32;
use heed::types::*;
use heed::{PolyDatabase, Database};
use levenshtein_automata::LevenshteinAutomatonBuilder;
use roaring::RoaringBitmap;
use slice_group_by::StrGroupBy; use slice_group_by::StrGroupBy;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
@ -14,3 +21,82 @@ pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
} }
/// The set of LMDB databases that together make up a search index.
#[derive(Clone)]
pub struct Index {
/// Untyped key/value store holding singleton entries such as the
/// "headers" bytes and the "words-fst" word dictionary (see `impl Index`).
pub main: PolyDatabase,
/// Maps a word to a serialized `RoaringBitmap` of the ids of the
/// documents containing that word.
pub postings_ids: Database<Str, ByteSlice>,
/// Maps a big-endian u32 document id to the raw stored document bytes.
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
}
impl Index {
/// Opens — creating them if they don't exist — the three databases of the
/// index inside the given LMDB environment.
pub fn new(env: &heed::Env) -> heed::Result<Index> {
let main = env.create_poly_database(None)?;
let postings_ids = env.create_database(Some("postings-ids"))?;
let documents = env.create_database(Some("documents"))?;
Ok(Index {
main,
postings_ids,
documents,
})
}
/// Returns the raw "headers" bytes stored in the main database,
/// or `None` when the index has never been populated.
pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
}
/// Searches the index for documents matching `query` and returns at most
/// 20 document ids (the cap is hard-coded below).
///
/// Each query word is matched fuzzily (Levenshtein distance 0/1/2 depending
/// on its length); per word the matching postings are unioned, and the
/// per-word results are intersected together.
/// NOTE(review): `DocumentId` is declared elsewhere in this crate —
/// presumably a u32 given the `BEU32` usage in callers; confirm.
pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {
// Without a word dictionary there is nothing to match against.
let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
Some(bytes) => fst::Set::new(bytes)?,
None => return Ok(Vec::new()),
};
// Building these factories is not free.
let lev0 = LevenshteinAutomatonBuilder::new(0, true);
let lev1 = LevenshteinAutomatonBuilder::new(1, true);
let lev2 = LevenshteinAutomatonBuilder::new(2, true);
let words: Vec<_> = alphanumeric_tokens(query).collect();
let number_of_words = words.len();
// One DFA per query word; the last word is matched as a prefix so that
// typing is incremental.
let dfas = words.into_iter().enumerate().map(|(i, word)| {
let word = word.cow_to_lowercase();
let is_last = i + 1 == number_of_words;
// NOTE(review): `word.len()` counts bytes, not chars, so non-ASCII
// words reach the higher distance tiers faster — confirm intended.
let dfa = match word.len() {
0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },
5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },
_ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },
};
(word, dfa)
});
// Intersection accumulator across words; `None` until the first word,
// so a single-word query returns its plain union.
let mut intersect_result: Option<RoaringBitmap> = None;
for (word, dfa) in dfas {
let before = Instant::now();
// Union the postings of every dictionary word accepted by this DFA.
let mut union_result = RoaringBitmap::default();
let mut stream = fst.search(dfa).into_stream();
while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?;
if let Some(ids) = self.postings_ids.get(rtxn, word)? {
let right = RoaringBitmap::deserialize_from(ids)?;
union_result.union_with(&right);
}
}
eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
intersect_result = match intersect_result.take() {
Some(mut left) => {
let before = Instant::now();
let left_len = left.len();
left.intersect_with(&union_result);
eprintln!("intersect between {:?} and {:?} took {:.02?}",
left_len, union_result.len(), before.elapsed());
Some(left)
},
None => Some(union_result),
};
}
// Hard cap: only the first 20 matching ids are returned.
Ok(intersect_result.unwrap_or_default().iter().take(20).collect())
}
}