Support multi-vector images
This commit is contained in:
parent
149302e43d
commit
dfc4b4a4bd
6
Cargo.lock
generated
6
Cargo.lock
generated
|
@ -386,9 +386,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.1"
|
||||
version = "0.38.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fbc6396159432b5c8490d4e301d8c705f61860b8b6c863bf79942ce5401968f3"
|
||||
checksum = "aabcb0461ebd01d6b79945797c27f8529082226cb630a9865a71870ff63532a4"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
|
@ -478,7 +478,7 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
|||
[[package]]
|
||||
name = "vectordb"
|
||||
version = "0.1.0"
|
||||
source = "git+https://git.asonix.dog/asonix/vectordb#d5f2e45fe33717f73fdafdeb5e056072c2ff9188"
|
||||
source = "git+https://git.asonix.dog/asonix/vectordb#23fe7c38a6a295678f5a641fb189ce7368a6daf6"
|
||||
dependencies = [
|
||||
"rand",
|
||||
"rayon",
|
||||
|
|
|
@ -10,20 +10,20 @@ pub(super) enum MagickError {
|
|||
|
||||
pub(super) fn identify_grayscale<P: AsRef<std::path::Path>>(
|
||||
path: P,
|
||||
) -> Result<MagickJson, MagickError> {
|
||||
) -> Result<Vec<MagickJson>, MagickError> {
|
||||
identify(path, &["-grayscale", "Rec709Luminance"])
|
||||
}
|
||||
|
||||
pub(super) fn identify_fullcolor<P: AsRef<std::path::Path>>(
|
||||
path: P,
|
||||
) -> Result<MagickJson, MagickError> {
|
||||
) -> Result<Vec<MagickJson>, MagickError> {
|
||||
identify(path, &[])
|
||||
}
|
||||
|
||||
fn identify<P: AsRef<std::path::Path>>(
|
||||
path: P,
|
||||
extra_args: &[&'static str],
|
||||
) -> Result<MagickJson, MagickError> {
|
||||
) -> Result<Vec<MagickJson>, MagickError> {
|
||||
let output = std::process::Command::new("magick")
|
||||
.args(&["convert", "-moments", "-auto-orient", "-colorspace", "sRGB"])
|
||||
.args(extra_args)
|
||||
|
@ -31,7 +31,7 @@ fn identify<P: AsRef<std::path::Path>>(
|
|||
.arg("json:")
|
||||
.output()?;
|
||||
|
||||
let [json]: [_; 1] = match serde_json::from_slice(&output.stdout) {
|
||||
let json: Vec<_> = match serde_json::from_slice(&output.stdout) {
|
||||
Ok(json) => json,
|
||||
Err(e) => {
|
||||
eprintln!(
|
||||
|
|
|
@ -27,7 +27,7 @@ pub struct ImageJson {
|
|||
pub base_depth: u64,
|
||||
pub channel_depth: HashMap<ChannelName, u64>,
|
||||
pub pixels: u64,
|
||||
pub image_statistics: Option<StatisticsJson>,
|
||||
pub image_statistics: Option<MaybeOverall>,
|
||||
pub channel_statistics: HashMap<ChannelName, StatisticsJson>,
|
||||
pub channel_moments: HashMap<ChannelName, ChannelMomentsJson>,
|
||||
pub channel_perceptual_hash: ChannelPerceptualHashJson,
|
||||
|
@ -92,6 +92,16 @@ pub enum ChannelName {
|
|||
Blue,
|
||||
}
|
||||
|
||||
/// Image-level statistics as stored in `ImageJson::image_statistics`,
/// accepted in either of two JSON shapes.
///
/// The enum is `untagged`, so the variants are tried in declaration order:
/// first an object that nests the statistics under an `"Overall"` key, then a
/// bare statistics object.
#[derive(Clone, Debug, PartialEq, PartialOrd, serde::Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum MaybeOverall {
|
||||
/// Statistics nested under an `"Overall"` key.
Overall {
|
||||
#[serde(rename = "Overall")]
|
||||
overall: StatisticsJson,
|
||||
},
|
||||
/// Statistics given directly, with no wrapping key.
Statistics(StatisticsJson),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, PartialOrd, serde::Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct StatisticsJson {
|
||||
|
|
331
src/main.rs
331
src/main.rs
|
@ -2,7 +2,10 @@ use std::path::Path;
|
|||
|
||||
use args::Args;
|
||||
use clap::Parser;
|
||||
use magick::{json::MagickJson, MagickError};
|
||||
use magick::{
|
||||
json::{MagickJson, PerceptualHashJson},
|
||||
MagickError,
|
||||
};
|
||||
use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
|
||||
use repo::{ImageRepo, RepoError};
|
||||
use vectordb::{TreeError, Vector, VectorDb};
|
||||
|
@ -12,8 +15,24 @@ mod magick;
|
|||
mod repo;
|
||||
|
||||
struct State {
|
||||
vectordb: VectorDb<7>,
|
||||
repo: ImageRepo,
|
||||
grayscale: VectorDb<7>,
|
||||
fullcolor: VectorDb<42>,
|
||||
grayscale_repo: ImageRepo,
|
||||
fullcolor_repo: ImageRepo,
|
||||
}
|
||||
|
||||
/// Similarity matches for one query image, kept separately per index.
struct SimilarImages {
|
||||
/// `(score, name)` pairs found through the grayscale index.
grayscale: Vec<(f32, String)>,
|
||||
/// `(score, name)` pairs found through the full-color index.
fullcolor: Vec<(f32, String)>,
|
||||
}
|
||||
|
||||
/// Top-level error type for this binary, wrapping the error of each subsystem.
#[derive(Debug)]
|
||||
enum Error {
|
||||
/// An I/O failure.
Io(std::io::Error),
|
||||
/// A failure reported by the `magick` module.
Magick(MagickError),
|
||||
/// A failure reported by the image repository.
Repo(RepoError),
|
||||
/// A failure reported by the vector database.
Vector(TreeError),
|
||||
/// A path could not be converted to a `&str` (see the `path.to_str()` call sites).
PathString,
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
@ -21,14 +40,22 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
|
||||
let db_directory = Path::new("./repo");
|
||||
|
||||
let vectordb = VectorDb::<7>::open(db_directory.join("vectordb"), 4)?;
|
||||
let repo = ImageRepo::open(db_directory.join("image-repo"))?;
|
||||
let grayscale = VectorDb::open(db_directory.join("grayscale"), 4)?;
|
||||
let fullcolor = VectorDb::open(db_directory.join("fullcolor"), 4)?;
|
||||
let grayscale_repo = ImageRepo::open(db_directory.join("grayscale-repo"))?;
|
||||
let fullcolor_repo = ImageRepo::open(db_directory.join("fullcolor-repo"))?;
|
||||
|
||||
if args.rebuild {
|
||||
vectordb.rebuild_all_hypertrees()?;
|
||||
grayscale.rebuild_all_hypertrees()?;
|
||||
fullcolor.rebuild_all_hypertrees()?;
|
||||
}
|
||||
|
||||
let state = State { vectordb, repo };
|
||||
let state = State {
|
||||
grayscale,
|
||||
fullcolor,
|
||||
grayscale_repo,
|
||||
fullcolor_repo,
|
||||
};
|
||||
|
||||
for path in &args.index {
|
||||
if !path.is_absolute() {
|
||||
|
@ -50,7 +77,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
return Ok(());
|
||||
}
|
||||
|
||||
for (score, path) in state.find_similar_images(query)? {
|
||||
let SimilarImages {
|
||||
grayscale,
|
||||
fullcolor,
|
||||
} = state.find_similar_images(query)?;
|
||||
|
||||
println!("Grayscale similarities:");
|
||||
for (score, path) in grayscale {
|
||||
println!("score {score:.5}: {path}");
|
||||
}
|
||||
|
||||
println!();
|
||||
println!("Fullcolor similarities:");
|
||||
for (score, path) in fullcolor {
|
||||
println!("score {score:.5}: {path}");
|
||||
}
|
||||
}
|
||||
|
@ -58,67 +97,188 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn create_vector(json: &MagickJson) -> Option<Vector<7>> {
|
||||
let Some(hashes) = json.image.channel_perceptual_hash.hashes.get("Channel0") else {
|
||||
return None;
|
||||
};
|
||||
fn to_array(values: &PerceptualHashJson) -> [[f32; 2]; 7] {
|
||||
[
|
||||
values.ph1, values.ph2, values.ph3, values.ph4, values.ph5, values.ph6, values.ph7,
|
||||
]
|
||||
}
|
||||
|
||||
let vector = Vector::from([
|
||||
hashes.ph1[0],
|
||||
hashes.ph2[0],
|
||||
hashes.ph3[0],
|
||||
hashes.ph4[0],
|
||||
hashes.ph5[0],
|
||||
hashes.ph6[0],
|
||||
hashes.ph7[0],
|
||||
]);
|
||||
fn create_fullcolor_vectors(json: &[MagickJson]) -> Vec<Vector<42>> {
|
||||
json.iter()
|
||||
.filter_map(|json| {
|
||||
let channel0 = to_array(json.image.channel_perceptual_hash.hashes.get("Channel0")?);
|
||||
let channel1 = to_array(json.image.channel_perceptual_hash.hashes.get("Channel1")?);
|
||||
let channel2 = to_array(json.image.channel_perceptual_hash.hashes.get("Channel2")?);
|
||||
|
||||
Some(vector)
|
||||
let array: [f32; 42] = channel0
|
||||
.into_iter()
|
||||
.chain(channel1.into_iter())
|
||||
.chain(channel2.into_iter())
|
||||
.flatten()
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.expect("Correct dimensionality");
|
||||
|
||||
Some(Vector::from(array))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn create_grayscale_vectors(json: &[MagickJson]) -> Vec<Vector<7>> {
|
||||
json.iter()
|
||||
.filter_map(|json| {
|
||||
if json.image.type_ != "Grayscale" {
|
||||
return None;
|
||||
}
|
||||
|
||||
let hashes = to_array(json.image.channel_perceptual_hash.hashes.get("Channel0")?);
|
||||
|
||||
let vector = Vector::from(hashes.map(|[first, _]| first));
|
||||
|
||||
Some(vector)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn find_similar_images<const N: usize>(
|
||||
vectordb: &VectorDb<N>,
|
||||
repo: &ImageRepo,
|
||||
threshold: Option<f32>,
|
||||
vectors: &[Vector<N>],
|
||||
) -> Result<Vec<(f32, String)>, Error> {
|
||||
let out = vectors
|
||||
.par_iter()
|
||||
.map(|vector| {
|
||||
let similar_vectors = vectordb.find_similarities(&vector, threshold, 10)?;
|
||||
|
||||
let mut output = Vec::with_capacity(vectors.len());
|
||||
|
||||
for similar_vector_id in similar_vectors {
|
||||
let Some(similar_vector) = vectordb.get_vector(similar_vector_id)? else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let similarity = vector.squared_euclidean_distance(&similar_vector);
|
||||
|
||||
let Some(name) = repo.get_name(similar_vector_id)? else {
|
||||
continue;
|
||||
};
|
||||
|
||||
output.push((similarity, name));
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
})
|
||||
.collect::<Result<Vec<Vec<_>>, Error>>()?
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect();
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
fn insert_grayscale_vectors(
|
||||
vectordb: &VectorDb<7>,
|
||||
repo: &ImageRepo,
|
||||
name: &str,
|
||||
json: &[MagickJson],
|
||||
) -> Result<(), Error> {
|
||||
let vectors = create_grayscale_vectors(json);
|
||||
|
||||
if vectors.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let ids = vectordb.insert_many_vectors(&vectors)?;
|
||||
|
||||
repo.add_image(name, &ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_fullcolor_vectors(
|
||||
vectordb: &VectorDb<42>,
|
||||
repo: &ImageRepo,
|
||||
name: &str,
|
||||
json: &[MagickJson],
|
||||
) -> Result<(), Error> {
|
||||
let vectors = create_fullcolor_vectors(json);
|
||||
|
||||
if vectors.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let ids = vectordb.insert_many_vectors(&vectors)?;
|
||||
|
||||
repo.add_image(name, &ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_existing_vectors<const N: usize>(
|
||||
vectordb: &VectorDb<N>,
|
||||
repo: &ImageRepo,
|
||||
name: &str,
|
||||
) -> Result<Vec<Vector<N>>, Error> {
|
||||
let vector_ids = repo.get_vectors(name)?;
|
||||
|
||||
let mut out = Vec::with_capacity(vector_ids.len());
|
||||
|
||||
for id in vector_ids {
|
||||
if let Some(vector) = vectordb.get_vector(id)? {
|
||||
out.push(vector);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn find_similar_images(&self, path: &Path) -> Result<Vec<(f32, String)>, Error> {
|
||||
fn find_similar_images(&self, path: &Path) -> Result<SimilarImages, Error> {
|
||||
let name = path.to_str().ok_or_else(|| Error::PathString)?;
|
||||
|
||||
let vector = if let Some(vector_id) = self.repo.get_vector(name)? {
|
||||
self.vectordb.get_vector(vector_id)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let mut existing_fullcolor =
|
||||
get_existing_vectors(&self.fullcolor, &self.fullcolor_repo, name)?;
|
||||
|
||||
let vector = if let Some(vector) = vector {
|
||||
println!("Fetched vector {:?}", vector);
|
||||
vector
|
||||
} else {
|
||||
let json = magick::identify_grayscale(path)?;
|
||||
let mut existing_grayscale =
|
||||
get_existing_vectors(&self.grayscale, &self.grayscale_repo, name)?;
|
||||
|
||||
let Some(vector) = create_vector(&json) else {
|
||||
return Ok(vec![]);
|
||||
};
|
||||
println!("Computed vector {:?}", vector);
|
||||
if existing_fullcolor.is_empty() {
|
||||
let json = magick::identify_fullcolor(path)?;
|
||||
|
||||
vector
|
||||
};
|
||||
|
||||
let vectors = self.vectordb.find_similarities(&vector, 10)?;
|
||||
|
||||
let mut output = Vec::with_capacity(vectors.len());
|
||||
|
||||
for vector_id in vectors {
|
||||
let Some(similar_vector) = self.vectordb.get_vector(vector_id)? else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let similarity = vector.squared_euclidean_distance(&similar_vector);
|
||||
|
||||
let Some(name) = self.repo.get_name(vector_id)? else {
|
||||
continue;
|
||||
};
|
||||
|
||||
output.push((similarity, name));
|
||||
for json in json {
|
||||
if json.image.type_ == "Grayscale" {
|
||||
existing_grayscale.extend(create_grayscale_vectors(&[json]));
|
||||
} else {
|
||||
existing_fullcolor.extend(create_fullcolor_vectors(&[json]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
if existing_grayscale.is_empty() {
|
||||
let json = magick::identify_grayscale(path)?;
|
||||
|
||||
existing_grayscale.extend(create_grayscale_vectors(&json));
|
||||
}
|
||||
|
||||
let grayscale = find_similar_images(
|
||||
&self.grayscale,
|
||||
&self.grayscale_repo,
|
||||
Some(0.18),
|
||||
&existing_grayscale,
|
||||
)?;
|
||||
|
||||
let fullcolor = find_similar_images(
|
||||
&self.fullcolor,
|
||||
&self.fullcolor_repo,
|
||||
Some(20.0),
|
||||
&existing_fullcolor,
|
||||
)?;
|
||||
|
||||
Ok(SimilarImages {
|
||||
grayscale,
|
||||
fullcolor,
|
||||
})
|
||||
}
|
||||
|
||||
fn visit_path(&self, path: &Path) -> Result<(), Error> {
|
||||
|
@ -147,15 +307,15 @@ impl State {
|
|||
fn visit_file(&self, path: &Path) -> Result<(), Error> {
|
||||
let name = path.to_str().ok_or_else(|| Error::PathString)?;
|
||||
|
||||
if name.ends_with("gif") || name.ends_with("xcf") || name.ends_with("psd") {
|
||||
if name.ends_with("xcf") || name.ends_with("psd") {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.repo.image_exists(name)? {
|
||||
if self.fullcolor_repo.image_exists(name)? && self.grayscale_repo.image_exists(name)? {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let json = match magick::identify_grayscale(path) {
|
||||
let json = match magick::identify_fullcolor(path) {
|
||||
Ok(json) => json,
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
|
@ -164,27 +324,50 @@ impl State {
|
|||
}
|
||||
};
|
||||
|
||||
let Some(vector) = create_vector(&json) else {
|
||||
return Ok(());
|
||||
};
|
||||
let (grayscale_json, fullcolor_json): (Vec<Option<MagickJson>>, Vec<Option<MagickJson>>) =
|
||||
json.into_iter()
|
||||
.map(|json| {
|
||||
if json.image.type_ == "Grayscale" {
|
||||
(Some(json), None)
|
||||
} else {
|
||||
(None, Some(json))
|
||||
}
|
||||
})
|
||||
.unzip();
|
||||
|
||||
let id = self.vectordb.insert_vector(&vector)?;
|
||||
let grayscale_json: Vec<_> = grayscale_json.into_iter().filter_map(|opt| opt).collect();
|
||||
let fullcolor_json: Vec<_> = fullcolor_json.into_iter().filter_map(|opt| opt).collect();
|
||||
|
||||
self.repo.add_image(name, id)?;
|
||||
if !grayscale_json.is_empty() && !self.grayscale_repo.image_exists(name)? {
|
||||
insert_grayscale_vectors(&self.grayscale, &self.grayscale_repo, name, &grayscale_json)?;
|
||||
}
|
||||
|
||||
if !fullcolor_json.is_empty() {
|
||||
if !self.fullcolor_repo.image_exists(name)? {
|
||||
insert_fullcolor_vectors(
|
||||
&self.fullcolor,
|
||||
&self.fullcolor_repo,
|
||||
name,
|
||||
&fullcolor_json,
|
||||
)?;
|
||||
}
|
||||
|
||||
let json = match magick::identify_grayscale(path) {
|
||||
Ok(json) => json,
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
eprintln!("{e:?}");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
insert_grayscale_vectors(&self.grayscale, &self.grayscale_repo, name, &json)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Error {
|
||||
Io(std::io::Error),
|
||||
Magick(MagickError),
|
||||
Repo(RepoError),
|
||||
Vector(TreeError),
|
||||
PathString,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
|
|
37
src/repo.rs
37
src/repo.rs
|
@ -1,6 +1,8 @@
|
|||
use std::path::Path;
|
||||
|
||||
use redb::{Database, ReadableTable, TableDefinition};
|
||||
use redb::{
|
||||
Database, MultimapTableDefinition, ReadableMultimapTable, ReadableTable, TableDefinition,
|
||||
};
|
||||
use vectordb::VectorId;
|
||||
|
||||
pub(super) struct ImageRepo {
|
||||
|
@ -18,8 +20,8 @@ pub(super) enum RepoError {
|
|||
Transaction(redb::TransactionError),
|
||||
}
|
||||
|
||||
const IMAGE_TABLE: TableDefinition<'static, &'static str, VectorId> =
|
||||
TableDefinition::new("image_repo::image_table");
|
||||
const IMAGE_MULTIMAP: MultimapTableDefinition<'static, &'static str, VectorId> =
|
||||
MultimapTableDefinition::new("image_repo::image_multimap");
|
||||
|
||||
const INVERSE_IMAGE_TABLE: TableDefinition<'static, VectorId, &'static str> =
|
||||
TableDefinition::new("image_repo::inverse_image_table");
|
||||
|
@ -34,7 +36,7 @@ impl ImageRepo {
|
|||
database.compact()?;
|
||||
|
||||
let txn = database.begin_write()?;
|
||||
txn.open_table(IMAGE_TABLE)?;
|
||||
txn.open_multimap_table(IMAGE_MULTIMAP)?;
|
||||
txn.open_table(INVERSE_IMAGE_TABLE)?;
|
||||
txn.commit()?;
|
||||
|
||||
|
@ -56,34 +58,39 @@ impl ImageRepo {
|
|||
pub(super) fn image_exists(&self, image: &str) -> Result<bool, RepoError> {
|
||||
let txn = self.database.begin_read()?;
|
||||
|
||||
let table = txn.open_table(IMAGE_TABLE)?;
|
||||
let table = txn.open_multimap_table(IMAGE_MULTIMAP)?;
|
||||
|
||||
let b = table.get(image)?.is_some();
|
||||
let b = table.get(image)?.next().is_some();
|
||||
|
||||
Ok(b)
|
||||
}
|
||||
|
||||
pub(super) fn get_vector(&self, image: &str) -> Result<Option<VectorId>, RepoError> {
|
||||
pub(super) fn get_vectors(&self, image: &str) -> Result<Vec<VectorId>, RepoError> {
|
||||
let txn = self.database.begin_read()?;
|
||||
|
||||
let table = txn.open_table(IMAGE_TABLE)?;
|
||||
let table = txn.open_multimap_table(IMAGE_MULTIMAP)?;
|
||||
|
||||
let opt = table.get(image)?.map(|value| value.value());
|
||||
let vec = table
|
||||
.get(image)?
|
||||
.map(|res| res.map(|value| value.value()))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
Ok(opt)
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
pub(super) fn add_image(&self, image: &str, vector_id: VectorId) -> Result<(), RepoError> {
|
||||
pub(super) fn add_image(&self, image: &str, vector_ids: &[VectorId]) -> Result<(), RepoError> {
|
||||
let txn = self.database.begin_write()?;
|
||||
|
||||
let mut image_table = txn.open_table(IMAGE_TABLE)?;
|
||||
let mut image_multimap = txn.open_multimap_table(IMAGE_MULTIMAP)?;
|
||||
let mut inverse_image_table = txn.open_table(INVERSE_IMAGE_TABLE)?;
|
||||
|
||||
image_table.insert(image, vector_id)?;
|
||||
inverse_image_table.insert(vector_id, image)?;
|
||||
for vector_id in vector_ids {
|
||||
image_multimap.insert(image, vector_id)?;
|
||||
inverse_image_table.insert(vector_id, image)?;
|
||||
}
|
||||
|
||||
drop(inverse_image_table);
|
||||
drop(image_table);
|
||||
drop(image_multimap);
|
||||
|
||||
txn.commit()?;
|
||||
|
||||
|
|
Loading…
Reference in a new issue