diff --git a/Cargo.lock b/Cargo.lock index 0b900223f..66b5e0147 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -649,12 +649,9 @@ checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177" [[package]] name = "hashbrown" -version = "0.8.2" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b62f79061a0bc2e046024cb7ba44b08419ed238ecbd9adbd787434b9e8c25" -dependencies = [ - "autocfg 1.0.0", -] +checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "headers" @@ -811,12 +808,13 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.5.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b45e59b16c76b11bf9738fd5d38879d3bd28ad292d7b313608becb17ae2df9" +checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" dependencies = [ "autocfg 1.0.0", "hashbrown", + "serde", ] [[package]] @@ -1014,6 +1012,7 @@ dependencies = [ "grenad", "heed", "human_format", + "indexmap", "itertools", "jemallocator", "levenshtein_automata", diff --git a/Cargo.toml b/Cargo.toml index 6eb2a9fef..264cddc50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "c390cfe" } heed = { version = "0.8.1", default-features = false, features = ["lmdb"] } human_format = "1.0.3" +indexmap = { version = "1.6.0", features = ["serde-1"] } jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.3" diff --git a/public/papaparse.min.js b/public/papaparse.min.js deleted file mode 100755 index 14c98ff82..000000000 --- a/public/papaparse.min.js +++ /dev/null @@ -1,7 +0,0 @@ -/* @license -Papa Parse -v5.0.2 -https://github.com/mholt/PapaParse -License: MIT -*/ -!function(e,t){"function"==typeof define&&define.amd?define([],t):"object"==typeof module&&"undefined"!=typeof exports?module.exports=t():e.Papa=t()}(this,function s(){"use strict";var f="undefined"!=typeof self?self:"undefined"!=typeof window?window:void 0!==f?f:{};var n=!f.document&&!!f.postMessage,o=n&&/blob:/i.test((f.location||{}).protocol),a={},h=0,b={parse:function(e,t){var r=(t=t||{}).dynamicTyping||!1;q(r)&&(t.dynamicTypingFunction=r,r={});if(t.dynamicTyping=r,t.transform=!!q(t.transform)&&t.transform,t.worker&&b.WORKERS_SUPPORTED){var i=function(){if(!b.WORKERS_SUPPORTED)return!1;var e=(r=f.URL||f.webkitURL||null,i=s.toString(),b.BLOB_URL||(b.BLOB_URL=r.createObjectURL(new Blob(["(",i,")();"],{type:"text/javascript"})))),t=new f.Worker(e);var r,i;return t.onmessage=_,t.id=h++,a[t.id]=t}();return i.userStep=t.step,i.userChunk=t.chunk,i.userComplete=t.complete,i.userError=t.error,t.step=q(t.step),t.chunk=q(t.chunk),t.complete=q(t.complete),t.error=q(t.error),delete t.worker,void i.postMessage({input:e,config:t,workerId:i.id})}var n=null;b.NODE_STREAM_INPUT,"string"==typeof e?n=t.download?new l(t):new p(t):!0===e.readable&&q(e.read)&&q(e.on)?n=new m(t):(f.File&&e instanceof File||e instanceof Object)&&(n=new c(t));return n.stream(e)},unparse:function(e,t){var i=!1,_=!0,g=",",v="\r\n",n='"',s=n+n,r=!1,a=null;!function(){if("object"!=typeof t)return;"string"!=typeof t.delimiter||b.BAD_DELIMITERS.filter(function(e){return-1!==t.delimiter.indexOf(e)}).length||(g=t.delimiter);("boolean"==typeof t.quotes||Array.isArray(t.quotes))&&(i=t.quotes);"boolean"!=typeof t.skipEmptyLines&&"string"!=typeof t.skipEmptyLines||(r=t.skipEmptyLines);"string"==typeof t.newline&&(v=t.newline);"string"==typeof t.quoteChar&&(n=t.quoteChar);"boolean"==typeof t.header&&(_=t.header);if(Array.isArray(t.columns)){if(0===t.columns.length)throw new Error("Option columns is empty");a=t.columns}void 0!==t.escapeChar&&(s=t.escapeChar+n)}();var o=new RegExp(U(n),"g");"string"==typeof e&&(e=JSON.parse(e));if(Array.isArray(e)){if(!e.length||Array.isArray(e[0]))return u(null,e,r);if("object"==typeof e[0])return u(a||h(e[0]),e,r)}else if("object"==typeof e)return"string"==typeof e.data&&(e.data=JSON.parse(e.data)),Array.isArray(e.data)&&(e.fields||(e.fields=e.meta&&e.meta.fields),e.fields||(e.fields=Array.isArray(e.data[0])?e.fields:h(e.data[0])),Array.isArray(e.data[0])||"object"==typeof e.data[0]||(e.data=[e.data])),u(e.fields||[],e.data||[],r);throw new Error("Unable to serialize unrecognized input");function h(e){if("object"!=typeof e)return[];var t=[];for(var r in e)t.push(r);return t}function u(e,t,r){var i="";"string"==typeof e&&(e=JSON.parse(e)),"string"==typeof t&&(t=JSON.parse(t));var n=Array.isArray(e)&&0=this._config.preview;if(o)f.postMessage({results:n,workerId:b.WORKER_ID,finished:a});else if(q(this._config.chunk)&&!t){if(this._config.chunk(n,this._handle),this._handle.paused()||this._handle.aborted())return void(this._halted=!0);n=void 0,this._completeResults=void 0}return this._config.step||this._config.chunk||(this._completeResults.data=this._completeResults.data.concat(n.data),this._completeResults.errors=this._completeResults.errors.concat(n.errors),this._completeResults.meta=n.meta),this._completed||!a||!q(this._config.complete)||n&&n.meta.aborted||(this._config.complete(this._completeResults,this._input),this._completed=!0),a||n&&n.meta.paused||this._nextChunk(),n}this._halted=!0},this._sendError=function(e){q(this._config.error)?this._config.error(e):o&&this._config.error&&f.postMessage({workerId:b.WORKER_ID,error:e,finished:!1})}}function l(e){var i;(e=e||{}).chunkSize||(e.chunkSize=b.RemoteChunkSize),u.call(this,e),this._nextChunk=n?function(){this._readChunk(),this._chunkLoaded()}:function(){this._readChunk()},this.stream=function(e){this._input=e,this._nextChunk()},this._readChunk=function(){if(this._finished)this._chunkLoaded();else{if(i=new XMLHttpRequest,this._config.withCredentials&&(i.withCredentials=this._config.withCredentials),n||(i.onload=y(this._chunkLoaded,this),i.onerror=y(this._chunkError,this)),i.open("GET",this._input,!n),this._config.downloadRequestHeaders){var e=this._config.downloadRequestHeaders;for(var t in e)i.setRequestHeader(t,e[t])}if(this._config.chunkSize){var r=this._start+this._config.chunkSize-1;i.setRequestHeader("Range","bytes="+this._start+"-"+r)}try{i.send()}catch(e){this._chunkError(e.message)}n&&0===i.status?this._chunkError():this._start+=this._config.chunkSize}},this._chunkLoaded=function(){4===i.readyState&&(i.status<200||400<=i.status?this._chunkError():(this._finished=!this._config.chunkSize||this._start>function(e){var t=e.getResponseHeader("Content-Range");if(null===t)return-1;return parseInt(t.substr(t.lastIndexOf("/")+1))}(i),this.parseChunk(i.responseText)))},this._chunkError=function(e){var t=i.statusText||e;this._sendError(new Error(t))}}function c(e){var i,n;(e=e||{}).chunkSize||(e.chunkSize=b.LocalChunkSize),u.call(this,e);var s="undefined"!=typeof FileReader;this.stream=function(e){this._input=e,n=e.slice||e.webkitSlice||e.mozSlice,s?((i=new FileReader).onload=y(this._chunkLoaded,this),i.onerror=y(this._chunkError,this)):i=new FileReaderSync,this._nextChunk()},this._nextChunk=function(){this._finished||this._config.preview&&!(this._rowCount=this._input.size,this.parseChunk(e.target.result)},this._chunkError=function(){this._sendError(i.error)}}function p(e){var r;u.call(this,e=e||{}),this.stream=function(e){return r=e,this._nextChunk()},this._nextChunk=function(){if(!this._finished){var e=this._config.chunkSize,t=e?r.substr(0,e):r;return r=e?r.substr(e):"",this._finished=!r,this.parseChunk(t)}}}function m(e){u.call(this,e=e||{});var t=[],r=!0,i=!1;this.pause=function(){u.prototype.pause.apply(this,arguments),this._input.pause()},this.resume=function(){u.prototype.resume.apply(this,arguments),this._input.resume()},this.stream=function(e){this._input=e,this._input.on("data",this._streamData),this._input.on("end",this._streamEnd),this._input.on("error",this._streamError)},this._checkIsFinished=function(){i&&1===t.length&&(this._finished=!0)},this._nextChunk=function(){this._checkIsFinished(),t.length?this.parseChunk(t.shift()):r=!0},this._streamData=y(function(e){try{t.push("string"==typeof e?e:e.toString(this._config.encoding)),r&&(r=!1,this._checkIsFinished(),this.parseChunk(t.shift()))}catch(e){this._streamError(e)}},this),this._streamError=y(function(e){this._streamCleanUp(),this._sendError(e)},this),this._streamEnd=y(function(){this._streamCleanUp(),i=!0,this._streamData("")},this),this._streamCleanUp=y(function(){this._input.removeListener("data",this._streamData),this._input.removeListener("end",this._streamEnd),this._input.removeListener("error",this._streamError)},this)}function r(g){var a,o,h,i=Math.pow(2,53),n=-i,s=/^\s*-?(\d*\.?\d+|\d+\.?\d*)(e[-+]?\d+)?\s*$/i,u=/(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))/,t=this,r=0,f=0,d=!1,e=!1,l=[],c={data:[],errors:[],meta:{}};if(q(g.step)){var p=g.step;g.step=function(e){if(c=e,_())m();else{if(m(),0===c.data.length)return;r+=e.data.length,g.preview&&r>g.preview?o.abort():p(c,t)}}}function v(e){return"greedy"===g.skipEmptyLines?""===e.join("").trim():1===e.length&&0===e[0].length}function m(){if(c&&h&&(k("Delimiter","UndetectableDelimiter","Unable to auto-detect delimiting character; defaulted to '"+b.DefaultDelimiter+"'"),h=!1),g.skipEmptyLines)for(var e=0;e=l.length?"__parsed_extra":l[r]),g.transform&&(s=g.transform(s,n)),s=y(n,s),"__parsed_extra"===n?(i[n]=i[n]||[],i[n].push(s)):i[n]=s}return g.header&&(r>l.length?k("FieldMismatch","TooManyFields","Too many fields: expected "+l.length+" fields but parsed "+r,f+t):r=i.length/2?"\r\n":"\r"}(e,i)),h=!1,g.delimiter)q(g.delimiter)&&(g.delimiter=g.delimiter(e),c.meta.delimiter=g.delimiter);else{var n=function(e,t,r,i,n){var s,a,o,h;n=n||[",","\t","|",";",b.RECORD_SEP,b.UNIT_SEP];for(var u=0;u=L)return R(!0)}else for(g=M,M++;;){if(-1===(g=a.indexOf(O,g+1)))return t||u.push({type:"Quotes",code:"MissingQuotes",message:"Quoted field unterminated",row:h.length,index:M}),w();if(g===i-1)return w(a.substring(M,g).replace(_,O));if(O!==z||a[g+1]!==z){if(O===z||0===g||a[g-1]!==z){var y=E(-1===m?p:Math.min(p,m));if(a[g+1+y]===D){f.push(a.substring(M,g).replace(_,O)),a[M=g+1+y+e]!==O&&(g=a.indexOf(O,M)),p=a.indexOf(D,M),m=a.indexOf(I,M);break}var k=E(m);if(a.substr(g+1+k,n)===I){if(f.push(a.substring(M,g).replace(_,O)),C(g+1+k+n),p=a.indexOf(D,M),g=a.indexOf(O,M),o&&(S(),j))return R();if(L&&h.length>=L)return R(!0);break}u.push({type:"Quotes",code:"InvalidQuotes",message:"Trailing quote on quoted field is malformed",row:h.length,index:M}),g++}}else g++}return w();function b(e){h.push(e),d=M}function E(e){var t=0;if(-1!==e){var r=a.substring(g+1,e);r&&""===r.trim()&&(t=r.length)}return t}function w(e){return t||(void 0===e&&(e=a.substr(M)),f.push(e),M=i,b(f),o&&S()),R()}function C(e){M=e,b(f),f=[],m=a.indexOf(I,M)}function R(e,t){return{data:t||!1?h[0]:h,errors:u,meta:{delimiter:D,linebreak:I,aborted:j,truncated:!!e,cursor:d+(r||0)}}}function S(){A(R(void 0,!0)),h=[],u=[]}function x(e,t,r){var i={nextDelim:void 0,quoteSearch:void 0},n=a.indexOf(O,t+1);if(t, /// Maps the document id to the document as a CSV line. - pub documents: Database, ByteSlice>, + pub documents: Database, ObkvCodec>, } impl Index { @@ -74,23 +74,15 @@ impl Index { pub fn documents<'t>( &self, rtxn: &'t heed::RoTxn, - iter: impl IntoIterator, - ) -> anyhow::Result> + ids: impl IntoIterator, + ) -> anyhow::Result)>> { - let ids: Vec<_> = iter.into_iter().collect(); - let mut content = Vec::new(); + let mut documents = Vec::new(); - for id in ids.iter().cloned() { - let document_content = self.documents.get(rtxn, &BEU32::new(id))? + for id in ids { + let kv = self.documents.get(rtxn, &BEU32::new(id))? .with_context(|| format!("Could not find document {}", id))?; - content.extend_from_slice(document_content); - } - - let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]); - - let mut documents = Vec::with_capacity(ids.len()); - for (id, result) in ids.into_iter().zip(rdr.records()) { - documents.push((id, result?)); + documents.push((id, kv)); } Ok(documents) diff --git a/src/indexing/store.rs b/src/indexing/store.rs index 5be54f2eb..29e4c046d 100644 --- a/src/indexing/store.rs +++ b/src/indexing/store.rs @@ -1,5 +1,5 @@ use std::collections::{BTreeMap, HashMap}; -use std::convert::TryFrom; +use std::convert::{TryFrom, TryInto}; use std::fs::File; use std::io::Read; use std::iter::FromIterator; @@ -204,11 +204,15 @@ impl Store { self.insert_word_docid(word, document_id)?; } - let record = CsvStringRecordCodec::bytes_encode(record) - .with_context(|| format!("could not encode CSV record"))?; + let mut writer = obkv::KvWriter::memory(); + record.iter().enumerate().for_each(|(i, v)| { + let key = i.try_into().unwrap(); + writer.insert(key, v.as_bytes()).unwrap(); + }); + let bytes = writer.into_inner().unwrap(); self.documents_ids.insert(document_id); - self.documents_writer.insert(document_id.to_be_bytes(), record)?; + self.documents_writer.insert(document_id.to_be_bytes(), bytes)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; Ok(()) diff --git a/src/subcommand/search.rs b/src/subcommand/search.rs index 2d2fc2724..f7f7adaeb 100644 --- a/src/subcommand/search.rs +++ b/src/subcommand/search.rs @@ -68,7 +68,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { let mut wtr = csv::Writer::from_writer(io::stdout()); wtr.write_record(&headers)?; for (_id, record) in documents { - wtr.write_record(&record)?; + wtr.write_record(record.iter().map(|(_, v)| v))?; } wtr.flush()?; diff --git a/src/subcommand/serve.rs b/src/subcommand/serve.rs index 094ce3059..fdbc60d04 100644 --- a/src/subcommand/serve.rs +++ b/src/subcommand/serve.rs @@ -1,5 +1,6 @@ use std::collections::HashSet; use std::fs::{File, create_dir_all}; +use std::mem; use std::net::SocketAddr; use std::path::PathBuf; use std::str::FromStr; @@ -7,9 +8,10 @@ use std::sync::Arc; use std::time::Instant; use askama_warp::Template; -use futures::{FutureExt, StreamExt}; use futures::stream; +use futures::{FutureExt, StreamExt}; use heed::EnvOpenOptions; +use indexmap::IndexMap; use serde::{Serialize, Deserialize}; use structopt::StructOpt; use tokio::fs::File as TFile; @@ -56,25 +58,21 @@ pub struct Opt { indexer: IndexerOpt, } -fn highlight_record(record: &csv::StringRecord, words: &HashSet) -> csv::StringRecord { - let mut output_record = csv::StringRecord::new(); - let mut buffer = String::new(); - for field in record { - buffer.clear(); - for (token_type, token) in simple_tokenizer(field) { +fn highlight_record(record: &mut IndexMap, words: &HashSet) { + for (_key, value) in record.iter_mut() { + let old_value = mem::take(value); + for (token_type, token) in simple_tokenizer(&old_value) { if token_type == TokenType::Word { let lowercase_token = token.to_lowercase(); let to_highlight = words.contains(&lowercase_token); - if to_highlight { buffer.push_str("") } - buffer.push_str(token); - if to_highlight { buffer.push_str("") } + if to_highlight { value.push_str("") } + value.push_str(token); + if to_highlight { value.push_str("") } } else { - buffer.push_str(token); + value.push_str(token); } } - output_record.push_field(&buffer); } - output_record } #[derive(Template)] @@ -327,13 +325,6 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { .body(include_str!("../../public/jquery-3.4.1.min.js")) ); - let dash_papaparse_route = warp::filters::method::get() - .and(warp::path!("papaparse.min.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../../public/papaparse.min.js")) - ); - let dash_filesize_route = warp::filters::method::get() .and(warp::path!("filesize.min.js")) .map(|| Response::builder() @@ -390,32 +381,29 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { let SearchResult { found_words, documents_ids } = search.execute().unwrap(); - let body = match index.headers(&rtxn).unwrap() { - Some(headers) => { - let mut wtr = csv::Writer::from_writer(Vec::new()); + let mut documents = Vec::new(); + if let Some(headers) = index.headers(&rtxn).unwrap() { + for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { + let mut record = record.iter() + .map(|(key_id, value)| { + let key = headers[key_id as usize].to_owned(); + let value = std::str::from_utf8(value).unwrap().to_owned(); + (key, value) + }) + .collect(); - // We write the headers - wtr.write_record(&headers).unwrap(); - - let documents = index.documents(&rtxn, documents_ids).unwrap(); - for (_id, record) in documents { - let record = if disable_highlighting { - record - } else { - highlight_record(&record, &found_words) - }; - wtr.write_record(&record).unwrap(); + if !disable_highlighting { + highlight_record(&mut record, &found_words); } - wtr.into_inner().unwrap() - }, - None => Vec::new(), - }; + documents.push(record); + } + } Response::builder() - .header("Content-Type", "text/csv") + .header("Content-Type", "application/json") .header("Time-Ms", before_search.elapsed().as_millis().to_string()) - .body(String::from_utf8(body).unwrap()) + .body(serde_json::to_string(&documents).unwrap()) }); async fn buf_stream( @@ -504,7 +492,6 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { .or(dash_bulma_dark_route) .or(dash_style_route) .or(dash_jquery_route) - .or(dash_papaparse_route) .or(dash_filesize_route) .or(dash_script_route) .or(updates_script_route) diff --git a/templates/index.html b/templates/index.html index 0fe6034ea..f2161457d 100644 --- a/templates/index.html +++ b/templates/index.html @@ -7,7 +7,6 @@ - {{ db_name }} | The milli engine diff --git a/templates/updates.html b/templates/updates.html index 8ffdae390..271394c92 100644 --- a/templates/updates.html +++ b/templates/updates.html @@ -7,7 +7,6 @@ - {{ db_name }} | Updates