//! The user for this operation requires only write access to the
//! table (not superuser).
//!
+//! Specifying multiple files is more efficient than indexing one file
+//! in each invocation.
+//!
//! ```text
//! $ export PASSWORD=$(gpg -d pw-cvmigrator.gpg)
-//! $ vecsearch index --file testdata/0
-//! indexing a file
-//! Loaded and encoded 59.479µs
-//! Took 14.982262ms
-//! $ vecsearch index --file testdata/1
-//! ...
-//! $ vecsearch index --file testdata/7
-//! ...
+//! $ vecsearch index --file testdata/0 --file testdata/1
+//! indexing file(s)
+//! Loaded and encoded 58.565µs
+//! Took 15.628167ms
+//! Loaded and encoded 55.513µs
+//! Took 8.018493ms
//! ```
//!
//! ## Search
//!
//! ## TODO
//!
-//! - index multiple files
//! - model from main not PR
//! - env support for all args
//!
password: String,
#[arg(long)]
- /// The file containing document contents
- file: String,
+ /// The file containing document contents. Specify multiple times to index several files.
+ file: Vec<String>,
},
/// Search the database for documents matching --search
Ok(())
}
-fn get_embeddings(input: &String, model: BertModel, mut tokenizer: Tokenizer) -> Result<Vec<f32>> {
+fn get_embeddings(input: &String, model: &BertModel, mut tokenizer: Tokenizer) -> Result<Vec<f32>> {
let start = std::time::Instant::now();
let device = &model.device;
let tokenizer = tokenizer
}
fn index(dbname: String, host: String, user: String, password: String,
- file: String, model: BertModel, tokenizer: Tokenizer) -> Result<()> {
-
- println!("indexing a file");
+ files: Vec<String>, model: BertModel, tokenizer: Tokenizer) -> Result<()> {
- let doc_content = std::fs::read_to_string(file)?;
- let embeddings = get_embeddings(&doc_content,model,tokenizer)?;
+ println!("indexing file(s)");
let mut client = postgres::Config::new()
.dbname(&dbname)
.password(password)
.connect(NoTls)?;
- client.execute("INSERT INTO documents (content, embedding) \
- values ($1, $2) \
- ON CONFLICT (content) DO UPDATE SET embedding = $2",
- &[&doc_content,&Vector::from(embeddings)],
- )?;
+ for file in files {
+ let doc_content = std::fs::read_to_string(file)?;
+ let embeddings = get_embeddings(&doc_content,&model,tokenizer.clone())?;
+
+ client.execute("INSERT INTO documents (content, embedding) \
+ values ($1, $2) \
+ ON CONFLICT (content) DO UPDATE SET embedding = $2",
+ &[&doc_content,&Vector::from(embeddings)],
+ )?;
+ }
let _ = client.close();
Ok(())
search: String, model: BertModel, tokenizer: Tokenizer) -> Result<()> {
println!("searching for document matches");
- let embeddings = get_embeddings(&search,model,tokenizer)?;
+ let embeddings = get_embeddings(&search,&model,tokenizer.clone())?;
let mut client = postgres::Config::new()
.dbname(&dbname)