
256 lines
8.5 KiB
Raw Normal View History

use std::borrow::Cow;
use std::collections::HashSet;
2020-07-11 14:17:37 +02:00
use std::fs::File;
2020-05-31 17:48:13 +02:00
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
2020-08-07 13:11:31 +02:00
use std::sync::Arc;
2020-05-31 17:48:13 +02:00
use std::time::Instant;
2020-07-11 14:17:37 +02:00
use askama_warp::Template;
2020-05-31 17:48:13 +02:00
use heed::EnvOpenOptions;
2020-08-07 13:11:31 +02:00
use oxidized_mtbl::Reader;
2020-05-31 17:48:13 +02:00
use serde::Deserialize;
use slice_group_by::StrGroupBy;
2020-05-31 17:48:13 +02:00
use structopt::StructOpt;
use warp::{Filter, http::Response};
use milli::Index;
2020-05-31 17:48:13 +02:00
// On Linux targets only, declare a jemalloc allocator instance.
// NOTE(review): no `#[global_allocator]` attribute is visible on this static in this
// view; without that attribute the static is never installed as the process
// allocator — confirm against the real file.
#[cfg(target_os = "linux")]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
2020-05-31 17:48:13 +02:00
// Command-line options of the server binary, parsed with structopt
// (`main` builds it with `Opt::from_args()`).
// The `///` comments on the fields are kept verbatim because structopt uses them
// as the `--help` text.
// NOTE(review): the closing brace of this struct is not visible in this view.
#[derive(Debug, StructOpt)]
#[structopt(name = "milli", about = "The server binary of the milli project.")]
2020-05-31 17:48:13 +02:00
struct Opt {
/// The database path where the LMDB database is located.
/// It is created if it doesn't already exist.
#[structopt(long = "db", parse(from_os_str))]
database: PathBuf,
/// The maximum size the database can take on disk. It is recommended to specify
/// the whole disk space (value must be a multiple of a page size).
#[structopt(long = "db-size", default_value = "107374182400")] // 100 GiB (100 * 1024^3 bytes)
database_size: usize,
/// Disable document highlighting on the dashboard.
// NOTE(review): no `#[structopt(long)]` attribute is visible for this field in this
// view — confirm how it is exposed on the command line.
disable_highlighting: bool,
/// Verbose mode (-v, -vv, -vvv, etc.)
#[structopt(short, long, parse(from_occurrences))]
verbose: usize,
2020-05-31 17:48:13 +02:00
/// The ip and port on which the database will listen for HTTP requests.
// NOTE(review): the default shown here is an empty string, and `main` parses this
// value with `SocketAddr::from_str(..).unwrap()`, which fails on an empty string.
// The default address was probably lost from this view — confirm the intended value.
#[structopt(short = "l", long, default_value = "")]
http_listen_addr: String,
// Builds a copy of `string` in which tokens found in `words` are wrapped in
// `<mark>`...`</mark>` tags.
//
// `string` is split with `linear_group_by_key` keyed on `char::is_alphanumeric`
// (presumably yielding consecutive runs of alphanumeric / non-alphanumeric
// characters — see the slice_group_by docs). Each token is lowercased before the
// lookup, so `words` is presumably expected to hold lowercase entries.
//
// NOTE(review): in this view nothing is pushed between the opening and closing
// tags, and neither the final `output` return nor the closing braces are visible.
// A line appending `token` to `output` appears to be missing — confirm against
// the real file.
fn highlight_string(string: &str, words: &HashSet<String>) -> String {
let mut output = String::new();
for token in string.linear_group_by_key(|c| c.is_alphanumeric()) {
let lowercase_token = token.to_lowercase();
// Only tokens whose lowercase form is in `words` get highlighted.
let to_highlight = words.contains(&lowercase_token);
if to_highlight { output.push_str("<mark>") }
if to_highlight { output.push_str("</mark>") }
2020-08-07 13:11:31 +02:00
// TODO find a better way or move this elsewhere
// Newtype around `Arc<T>`. `main` wraps the memory-mapped documents file in it
// before handing it to `Reader::new`.
struct TransitiveArc<T>(Arc<T>);
// Exposes the wrapped value as a byte slice whenever `T` itself is `AsRef<[u8]>`.
// NOTE(review): the method body is not visible in this view.
impl<T: AsRef<[u8]>> AsRef<[u8]> for TransitiveArc<T> {
fn as_ref(&self) -> &[u8] {
// Manual `Clone` impl with no `T: Clone` bound.
// NOTE(review): the method body is not visible in this view; presumably it clones
// the inner `Arc` — confirm.
impl<T> Clone for TransitiveArc<T> {
fn clone(&self) -> TransitiveArc<T> {
2020-07-11 14:17:37 +02:00
// Data handed to the `index.html` template; `main` builds it in the dashboard
// HTML route.
// NOTE(review): the derive that normally accompanies `#[template]` and the closing
// brace are not visible in this view.
#[template(path = "index.html")]
struct IndexTemplate {
// Database name; `main` derives it from the file stem of the database path.
db_name: String,
// Size in bytes; `main` reads it from the metadata of `data.mdb`.
db_size: usize,
// Entry count; `main` reads it from the documents reader metadata.
docs_count: usize,
2020-05-31 17:48:13 +02:00
// Entry point of the server binary. From what is visible it:
//   1. parses the command-line options,
//   2. opens the LMDB environment/index and memory-maps `documents.mtbl`,
//   3. builds warp GET routes for the dashboard page and its static assets,
//   4. builds a POST route that runs a search and answers with CSV,
//   5. parses the listen address.
// NOTE(review): this view of the function is incomplete — several statements are
// cut off mid-expression and the end of the function is missing — so the comments
// below only describe what is visible.
async fn main() -> anyhow::Result<()> {
let opt = Opt::from_args();
2020-05-31 17:48:13 +02:00
// NOTE(review): the rest of this builder chain is not visible in this view.
let env = EnvOpenOptions::new()
2020-08-07 13:11:31 +02:00
// Open the LMDB database.
2020-05-31 17:48:13 +02:00
let index = Index::new(&env)?;
2020-08-07 13:11:31 +02:00
// Open the documents MTBL database.
let path = opt.database.join("documents.mtbl");
let file = File::open(path)?;
// NOTE(review): no SAFETY justification is given for this mapping; presumably the
// documents file is not modified while the server runs — confirm.
let mmap = unsafe { memmap::Mmap::map(&file)? };
// Wrapped in `TransitiveArc` before being handed to the reader; the reader is
// cloned further down for the query route.
let mmap = TransitiveArc(Arc::new(mmap));
let documents = Reader::new(mmap)?;
// Retrieve the database name from the file stem (without the extension),
// the disk file size and the number of documents in the database.
2020-07-11 14:17:37 +02:00
// Falls back to an empty name when the path has no stem or it is not valid UTF-8.
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
// NOTE(review): `len()` is a `u64`; the `as usize` cast truncates on targets where
// `usize` is narrower than 64 bits.
let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize;
2020-08-07 13:11:31 +02:00
// Retrieve the documents count.
let docs_count = documents.metadata().count_entries;
2020-07-11 14:17:37 +02:00
2020-05-31 17:48:13 +02:00
// We run and wait on the HTTP server
// Expose an HTML page to debug the search in a browser
let dash_html_route = warp::filters::method::get()
2020-07-11 14:17:37 +02:00
.map(move || {
// A fresh template value is built on every request, hence the clone of `db_name`.
// NOTE(review): the `db_size` field and the end of this closure are not visible here.
IndexTemplate {
db_name: db_name.clone(),
docs_count: docs_count as usize,
2020-07-11 14:17:37 +02:00
2020-05-31 17:48:13 +02:00
// Static asset routes. Each one is a GET filter that builds a response with the
// content type matching the asset (CSS, JavaScript or SVG).
// NOTE(review): the path filters and response bodies are not visible in this view.
let dash_bulma_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "text/css; charset=utf-8")
2020-07-13 23:51:41 +02:00
let dash_bulma_dark_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "text/css; charset=utf-8")
let dash_style_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "text/css; charset=utf-8")
2020-05-31 17:48:13 +02:00
let dash_jquery_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
let dash_papaparse_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
2020-07-11 14:17:37 +02:00
let dash_filesize_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
let dash_script_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "application/javascript; charset=utf-8")
2020-07-15 23:51:12 +02:00
let dash_logo_white_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "image/svg+xml")
let dash_logo_black_route = warp::filters::method::get()
.map(|| Response::builder()
.header("content-type", "image/svg+xml")
2020-05-31 17:48:13 +02:00
// Body of a search request: a single `query` string.
// NOTE(review): the derive on this struct (presumably `Deserialize`, which the file
// imports) and its closing brace are not visible in this view.
struct QueryBody {
query: String,
// Copies moved into the query-route closure below.
let env_cloned = env.clone();
2020-08-07 13:11:31 +02:00
let documents_cloned = documents.clone();
let disable_highlighting = opt.disable_highlighting;
2020-05-31 17:48:13 +02:00
let query_route = warp::filters::method::post()
.map(move |query: QueryBody| {
// Start the timer that is reported in the `Time-Ms` response header.
let before_search = Instant::now();
let rtxn = env_cloned.read_txn().unwrap();
2020-07-14 11:51:21 +02:00
// NOTE(review): the right-hand side below is cut off in this view (the call target
// before `, &query.query)` is missing); presumably a search call on `index` taking
// `&rtxn` — confirm against the real file.
let (words, documents_ids) =, &query.query).unwrap();
2020-05-31 17:48:13 +02:00
let mut body = Vec::new();
if let Some(headers) = index.headers(&rtxn).unwrap() {
// We write the headers
for id in documents_ids {
// Documents are looked up by the big-endian bytes of their id.
let id_bytes = id.to_be_bytes();
2020-08-07 13:11:31 +02:00
let content = documents_cloned.clone().get(&id_bytes).unwrap();
2020-06-11 11:55:03 +02:00
// NOTE(review): `format!` is evaluated even when the document is found (clippy
// `expect_fun_call`); a missing document panics inside the request handler.
let content = content.expect(&format!("could not find document {}", id));
let content = std::str::from_utf8(content.as_ref()).unwrap();
// Highlighting is skipped when it was disabled on the command line.
// NOTE(review): the first branch of this `if` is not visible in this view.
let content = if disable_highlighting {
} else {
Cow::from(highlight_string(content, &words))
2020-05-31 17:48:13 +02:00
// The response is sent as CSV, with the elapsed search time in milliseconds.
.header("Content-Type", "text/csv")
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
// NOTE(review): the chain combining the other routes is not visible in this view.
let routes = dash_html_route
2020-07-13 23:51:41 +02:00
2020-05-31 17:48:13 +02:00
2020-07-11 14:17:37 +02:00
2020-07-15 23:51:12 +02:00
2020-05-31 17:48:13 +02:00
// NOTE(review): this panics when the configured address does not parse, and the
// default visible on `Opt::http_listen_addr` is an empty string. Since `main`
// returns `anyhow::Result`, propagating the error with `?` may be preferable — confirm.
let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap();