2020-07-14 17:27:46 +08:00
|
|
|
use std::borrow::Cow;
|
2020-07-11 20:17:37 +08:00
|
|
|
use std::fs::File;
|
2020-05-31 23:48:13 +08:00
|
|
|
use std::net::SocketAddr;
|
|
|
|
use std::path::PathBuf;
|
|
|
|
use std::str::FromStr;
|
|
|
|
use std::time::Instant;
|
|
|
|
|
2020-07-11 20:17:37 +08:00
|
|
|
use askama_warp::Template;
|
2020-05-31 23:48:13 +08:00
|
|
|
use heed::EnvOpenOptions;
|
2020-07-14 17:27:46 +08:00
|
|
|
use regex::Regex;
|
2020-05-31 23:48:13 +08:00
|
|
|
use serde::Deserialize;
|
|
|
|
use structopt::StructOpt;
|
|
|
|
use warp::{Filter, http::Response};
|
|
|
|
|
2020-07-12 06:16:41 +08:00
|
|
|
use milli::{BEU32, Index};
|
2020-05-31 23:48:13 +08:00
|
|
|
|
2020-06-11 04:05:01 +08:00
|
|
|
#[cfg(target_os = "linux")]
|
|
|
|
#[global_allocator]
|
|
|
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
#[derive(Debug, StructOpt)]
|
2020-07-12 17:06:45 +08:00
|
|
|
#[structopt(name = "milli", about = "The server binary of the milli project.")]
|
2020-05-31 23:48:13 +08:00
|
|
|
struct Opt {
|
|
|
|
/// The database path where the LMDB database is located.
|
|
|
|
/// It is created if it doesn't already exist.
|
|
|
|
#[structopt(long = "db", parse(from_os_str))]
|
|
|
|
database: PathBuf,
|
|
|
|
|
|
|
|
/// The maximum size the database can take on disk. It is recommended to specify
|
|
|
|
/// the whole disk space (value must be a multiple of a page size).
|
|
|
|
#[structopt(long = "db-size", default_value = "107374182400")] // 100 GB
|
|
|
|
database_size: usize,
|
|
|
|
|
2020-07-14 17:27:46 +08:00
|
|
|
/// Disable document highlighting on the dashboard.
|
|
|
|
#[structopt(long)]
|
|
|
|
disable_highlighting: bool,
|
|
|
|
|
2020-07-12 17:04:35 +08:00
|
|
|
/// Verbose mode (-v, -vv, -vvv, etc.)
|
|
|
|
#[structopt(short, long, parse(from_occurrences))]
|
|
|
|
verbose: usize,
|
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
/// The ip and port on which the database will listen for HTTP requests.
|
|
|
|
#[structopt(short = "l", long, default_value = "127.0.0.1:9700")]
|
|
|
|
http_listen_addr: String,
|
|
|
|
}
|
|
|
|
|
2020-07-11 20:17:37 +08:00
|
|
|
#[derive(Template)]
|
|
|
|
#[template(path = "index.html")]
|
|
|
|
struct IndexTemplate {
|
|
|
|
db_name: String,
|
|
|
|
db_size: usize,
|
|
|
|
docs_count: usize,
|
|
|
|
}
|
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
#[tokio::main]
|
|
|
|
async fn main() -> anyhow::Result<()> {
|
|
|
|
let opt = Opt::from_args();
|
|
|
|
|
2020-07-12 17:04:35 +08:00
|
|
|
stderrlog::new()
|
|
|
|
.verbosity(opt.verbose)
|
|
|
|
.show_level(false)
|
|
|
|
.timestamp(stderrlog::Timestamp::Off)
|
|
|
|
.init()?;
|
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
std::fs::create_dir_all(&opt.database)?;
|
|
|
|
let env = EnvOpenOptions::new()
|
|
|
|
.map_size(opt.database_size)
|
|
|
|
.max_dbs(10)
|
|
|
|
.open(&opt.database)?;
|
|
|
|
|
|
|
|
let index = Index::new(&env)?;
|
|
|
|
|
2020-07-11 20:51:59 +08:00
|
|
|
// Retrieve the database the file stem (w/o the extension),
|
|
|
|
// the disk file size and the number of documents in the database.
|
2020-07-11 20:17:37 +08:00
|
|
|
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
|
|
|
|
let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize;
|
2020-07-11 20:51:59 +08:00
|
|
|
let docs_count = env.read_txn().and_then(|r| index.documents.len(&r))?;
|
2020-07-11 20:17:37 +08:00
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
// We run and wait on the HTTP server
|
|
|
|
|
|
|
|
// Expose an HTML page to debug the search in a browser
|
|
|
|
let dash_html_route = warp::filters::method::get()
|
|
|
|
.and(warp::filters::path::end())
|
2020-07-11 20:17:37 +08:00
|
|
|
.map(move || {
|
|
|
|
IndexTemplate {
|
|
|
|
db_name: db_name.clone(),
|
|
|
|
db_size,
|
|
|
|
docs_count,
|
|
|
|
}
|
|
|
|
});
|
2020-05-31 23:48:13 +08:00
|
|
|
|
|
|
|
let dash_bulma_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("bulma.min.css"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "text/css; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/bulma.min.css"))
|
|
|
|
);
|
|
|
|
|
2020-07-14 05:51:41 +08:00
|
|
|
let dash_bulma_dark_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("bulma-prefers-dark.min.css"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "text/css; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/bulma-prefers-dark.min.css"))
|
|
|
|
);
|
|
|
|
|
2020-07-11 17:48:27 +08:00
|
|
|
let dash_style_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("style.css"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "text/css; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/style.css"))
|
|
|
|
);
|
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
let dash_jquery_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("jquery-3.4.1.min.js"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/jquery-3.4.1.min.js"))
|
|
|
|
);
|
|
|
|
|
|
|
|
let dash_papaparse_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("papaparse.min.js"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/papaparse.min.js"))
|
|
|
|
);
|
|
|
|
|
2020-07-11 20:17:37 +08:00
|
|
|
let dash_filesize_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("filesize.min.js"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/filesize.min.js"))
|
|
|
|
);
|
|
|
|
|
2020-07-11 17:48:27 +08:00
|
|
|
let dash_script_route = warp::filters::method::get()
|
|
|
|
.and(warp::path!("script.js"))
|
|
|
|
.map(|| Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../../public/script.js"))
|
|
|
|
);
|
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
#[derive(Deserialize)]
|
|
|
|
struct QueryBody {
|
|
|
|
query: String,
|
|
|
|
}
|
|
|
|
|
|
|
|
let env_cloned = env.clone();
|
2020-07-14 17:27:46 +08:00
|
|
|
let disable_highlighting = opt.disable_highlighting;
|
2020-05-31 23:48:13 +08:00
|
|
|
let query_route = warp::filters::method::post()
|
|
|
|
.and(warp::path!("query"))
|
|
|
|
.and(warp::body::json())
|
|
|
|
.map(move |query: QueryBody| {
|
|
|
|
let before_search = Instant::now();
|
|
|
|
let rtxn = env_cloned.read_txn().unwrap();
|
|
|
|
|
2020-07-14 17:51:21 +08:00
|
|
|
let (words, documents_ids) = index.search(&rtxn, &query.query).unwrap();
|
2020-05-31 23:48:13 +08:00
|
|
|
|
|
|
|
let mut body = Vec::new();
|
|
|
|
if let Some(headers) = index.headers(&rtxn).unwrap() {
|
|
|
|
// We write the headers
|
|
|
|
body.extend_from_slice(headers);
|
|
|
|
|
2020-07-14 17:51:21 +08:00
|
|
|
let mut regex = format!(r"(?i)\b(");
|
|
|
|
let number_of_words = words.len();
|
|
|
|
words.into_iter().enumerate().for_each(|(i, w)| {
|
|
|
|
regex.push_str(&w);
|
|
|
|
if i != number_of_words - 1 { regex.push('|') }
|
|
|
|
});
|
|
|
|
regex.push_str(r")\b");
|
|
|
|
let re = Regex::new(®ex).unwrap();
|
2020-07-14 17:27:46 +08:00
|
|
|
|
2020-05-31 23:48:13 +08:00
|
|
|
for id in documents_ids {
|
2020-06-11 17:55:03 +08:00
|
|
|
let content = index.documents.get(&rtxn, &BEU32::new(id)).unwrap();
|
|
|
|
let content = content.expect(&format!("could not find document {}", id));
|
2020-07-14 17:27:46 +08:00
|
|
|
let content = std::str::from_utf8(content).unwrap();
|
|
|
|
|
|
|
|
let content = if disable_highlighting {
|
|
|
|
Cow::from(content)
|
|
|
|
} else {
|
|
|
|
re.replace_all(content, "<mark>$1</mark>")
|
|
|
|
};
|
|
|
|
|
|
|
|
body.extend_from_slice(content.as_bytes());
|
2020-05-31 23:48:13 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Response::builder()
|
|
|
|
.header("Content-Type", "text/csv")
|
|
|
|
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
|
|
|
|
.body(String::from_utf8(body).unwrap())
|
|
|
|
});
|
|
|
|
|
|
|
|
let routes = dash_html_route
|
|
|
|
.or(dash_bulma_route)
|
2020-07-14 05:51:41 +08:00
|
|
|
.or(dash_bulma_dark_route)
|
2020-07-11 17:48:27 +08:00
|
|
|
.or(dash_style_route)
|
2020-05-31 23:48:13 +08:00
|
|
|
.or(dash_jquery_route)
|
|
|
|
.or(dash_papaparse_route)
|
2020-07-11 20:17:37 +08:00
|
|
|
.or(dash_filesize_route)
|
2020-07-11 17:48:27 +08:00
|
|
|
.or(dash_script_route)
|
2020-05-31 23:48:13 +08:00
|
|
|
.or(query_route);
|
|
|
|
|
|
|
|
let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap();
|
|
|
|
warp::serve(routes).run(addr).await;
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|