diff --git a/lib/segment/src/data_types/named_vectors.rs b/lib/segment/src/data_types/named_vectors.rs index 9d6eb8149c8..4a1f264ad07 100644 --- a/lib/segment/src/data_types/named_vectors.rs +++ b/lib/segment/src/data_types/named_vectors.rs @@ -89,6 +89,15 @@ impl<'a> From for CowVector<'a> { } } +impl<'a> From> for CowVector<'a> { + fn from(v: Cow<'a, MultiDenseVector>) -> Self { + match v { + Cow::Borrowed(v) => CowVector::MultiDense(Cow::Borrowed(v)), + Cow::Owned(v) => CowVector::MultiDense(Cow::Owned(v)), + } + } +} + impl<'a> From<&'a SparseVector> for CowVector<'a> { fn from(v: &'a SparseVector) -> Self { CowVector::Sparse(Cow::Borrowed(v)) diff --git a/lib/segment/src/data_types/primitive.rs b/lib/segment/src/data_types/primitive.rs index 811d2b9eee6..e811d9ad814 100644 --- a/lib/segment/src/data_types/primitive.rs +++ b/lib/segment/src/data_types/primitive.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use itertools::Itertools; use serde::{Deserialize, Serialize}; +use super::vectors::TypedMultiDenseVector; use crate::data_types::vectors::{VectorElementType, VectorElementTypeByte}; use crate::spaces::metric::Metric; use crate::spaces::simple::{CosineMetric, DotProductMetric, EuclidMetric, ManhattanMetric}; @@ -22,6 +23,14 @@ pub trait PrimitiveVectorElement: ) -> Cow<'a, [f32]>; fn datatype() -> VectorStorageDatatype; + + fn from_float_multivector( + multivector: Cow>, + ) -> Cow>; + + fn into_float_multivector( + multivector: Cow>, + ) -> Cow>; } impl PrimitiveVectorElement for VectorElementType { @@ -44,6 +53,18 @@ impl PrimitiveVectorElement for VectorElementType { fn datatype() -> VectorStorageDatatype { VectorStorageDatatype::Float32 } + + fn from_float_multivector( + multivector: Cow>, + ) -> Cow> { + multivector + } + + fn into_float_multivector( + multivector: Cow>, + ) -> Cow> { + multivector + } } impl PrimitiveVectorElement for VectorElementTypeByte { @@ -86,4 +107,30 @@ impl PrimitiveVectorElement for VectorElementTypeByte { fn datatype() -> VectorStorageDatatype { VectorStorageDatatype::Uint8 } + + fn from_float_multivector( + multivector: Cow>, + ) -> Cow> { + Cow::Owned(TypedMultiDenseVector::new( + multivector + .inner_vector + .iter() + .map(|&x| x as Self) + .collect_vec(), + multivector.dim, + )) + } + + fn into_float_multivector( + multivector: Cow>, + ) -> Cow> { + Cow::Owned(TypedMultiDenseVector::new( + multivector + .inner_vector + .iter() + .map(|&x| x as VectorElementType) + .collect_vec(), + multivector.dim, + )) + } } diff --git a/lib/segment/src/data_types/vectors.rs b/lib/segment/src/data_types/vectors.rs index 6334c2b6903..12f9d107197 100644 --- a/lib/segment/src/data_types/vectors.rs +++ b/lib/segment/src/data_types/vectors.rs @@ -1,12 +1,14 @@ use std::collections::HashMap; use std::slice::ChunksExactMut; +use itertools::Itertools; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sparse::common::sparse_vector::SparseVector; use validator::Validate; use super::named_vectors::NamedVectors; +use super::primitive::PrimitiveVectorElement; use crate::common::operation_error::OperationError; use crate::common::utils::transpose_map_into_named_vector; use crate::vector_storage::query::context_query::ContextQuery; @@ -176,13 +178,15 @@ pub type DenseVector = TypedDenseVector; /// Type for multi dense vector #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] -pub struct MultiDenseVector { - pub inner_vector: DenseVector, // vectors are flattened into a single vector - pub dim: usize, // dimension of each vector +pub struct TypedMultiDenseVector { + pub inner_vector: TypedDenseVector, // vectors are flattened into a single vector + pub dim: usize, // dimension of each vector } -impl MultiDenseVector { - pub fn new(flattened_vectors: DenseVector, dim: usize) -> Self { +pub type MultiDenseVector = TypedMultiDenseVector; + +impl TypedMultiDenseVector { + pub fn new(flattened_vectors: TypedDenseVector, dim: usize) -> Self { Self { inner_vector: flattened_vectors, dim, @@ -192,17 +196,17 @@ impl MultiDenseVector { /// MultiDenseVector cannot be empty, so we use a placeholder vector instead pub fn placeholder(dim: usize) -> Self { Self { - inner_vector: vec![1.0; dim], + inner_vector: vec![Default::default(); dim], dim, } } /// Slices the multi vector into the underlying individual vectors - pub fn multi_vectors(&self) -> impl Iterator { + pub fn multi_vectors(&self) -> impl Iterator { self.inner_vector.chunks_exact(self.dim) } - pub fn multi_vectors_mut(&mut self) -> ChunksExactMut<'_, VectorElementType> { + pub fn multi_vectors_mut(&mut self) -> ChunksExactMut<'_, T> { self.inner_vector.chunks_exact_mut(self.dim) } @@ -211,10 +215,10 @@ impl MultiDenseVector { } } -impl TryFrom> for MultiDenseVector { +impl TryFrom>> for TypedMultiDenseVector { type Error = OperationError; - fn try_from(value: Vec) -> Result { + fn try_from(value: Vec>) -> Result { if value.is_empty() { return Err(OperationError::ValidationError { description: "MultiDenseVector cannot be empty".to_string(), @@ -228,8 +232,8 @@ impl TryFrom> for MultiDenseVector { received_dim: bad_vec.len(), }) } else { - let inner_vector = value.into_iter().flatten().collect(); - let multi_dense = MultiDenseVector { inner_vector, dim }; + let inner_vector = value.into_iter().flatten().collect_vec(); + let multi_dense = TypedMultiDenseVector { inner_vector, dim }; Ok(multi_dense) } } diff --git a/lib/segment/src/vector_storage/query_scorer/multi_custom_query_scorer.rs b/lib/segment/src/vector_storage/query_scorer/multi_custom_query_scorer.rs index fa6811a3fe7..7951e14ba8d 100644 --- a/lib/segment/src/vector_storage/query_scorer/multi_custom_query_scorer.rs +++ b/lib/segment/src/vector_storage/query_scorer/multi_custom_query_scorer.rs @@ -12,7 +12,7 @@ use crate::vector_storage::MultiVectorStorage; pub struct MultiCustomQueryScorer< 'a, TMetric: Metric, - TVectorStorage: MultiVectorStorage, + TVectorStorage: MultiVectorStorage, TQuery: Query, > { vector_storage: &'a TVectorStorage, @@ -23,7 +23,7 @@ pub struct MultiCustomQueryScorer< impl< 'a, TMetric: Metric, - TVectorStorage: MultiVectorStorage, + TVectorStorage: MultiVectorStorage, TQuery: Query + TransformInto, > MultiCustomQueryScorer<'a, TMetric, TVectorStorage, TQuery> { @@ -50,7 +50,7 @@ impl< impl< 'a, TMetric: Metric, - TVectorStorage: MultiVectorStorage, + TVectorStorage: MultiVectorStorage, TQuery: Query, > QueryScorer for MultiCustomQueryScorer<'a, TMetric, TVectorStorage, TQuery> diff --git a/lib/segment/src/vector_storage/query_scorer/multi_metric_query_scorer.rs b/lib/segment/src/vector_storage/query_scorer/multi_metric_query_scorer.rs index 409466941bb..76e1163ad24 100644 --- a/lib/segment/src/vector_storage/query_scorer/multi_metric_query_scorer.rs +++ b/lib/segment/src/vector_storage/query_scorer/multi_metric_query_scorer.rs @@ -11,15 +11,18 @@ use crate::vector_storage::MultiVectorStorage; pub struct MultiMetricQueryScorer< 'a, TMetric: Metric, - TVectorStorage: MultiVectorStorage, + TVectorStorage: MultiVectorStorage, > { vector_storage: &'a TVectorStorage, query: MultiDenseVector, metric: PhantomData, } -impl<'a, TMetric: Metric, TVectorStorage: MultiVectorStorage> - MultiMetricQueryScorer<'a, TMetric, TVectorStorage> +impl< + 'a, + TMetric: Metric, + TVectorStorage: MultiVectorStorage, + > MultiMetricQueryScorer<'a, TMetric, TVectorStorage> { pub fn new(query: MultiDenseVector, vector_storage: &'a TVectorStorage) -> Self { let slices = query.multi_vectors(); @@ -47,8 +50,11 @@ impl<'a, TMetric: Metric, TVectorStorage: MultiVectorStorage> } } -impl<'a, TMetric: Metric, TVectorStorage: MultiVectorStorage> - QueryScorer for MultiMetricQueryScorer<'a, TMetric, TVectorStorage> +impl< + 'a, + TMetric: Metric, + TVectorStorage: MultiVectorStorage, + > QueryScorer for MultiMetricQueryScorer<'a, TMetric, TVectorStorage> { #[inline] fn score_stored(&self, idx: PointOffsetType) -> ScoreType { diff --git a/lib/segment/src/vector_storage/raw_scorer.rs b/lib/segment/src/vector_storage/raw_scorer.rs index 72b203db28d..3cf3ebf5014 100644 --- a/lib/segment/src/vector_storage/raw_scorer.rs +++ b/lib/segment/src/vector_storage/raw_scorer.rs @@ -414,7 +414,7 @@ where })) } -pub fn raw_multi_scorer_impl<'a, TVectorStorage: MultiVectorStorage>( +pub fn raw_multi_scorer_impl<'a, TVectorStorage: MultiVectorStorage>( query: QueryVector, vector_storage: &'a TVectorStorage, point_deleted: &'a BitSlice, @@ -451,7 +451,7 @@ pub fn raw_multi_scorer_impl<'a, TVectorStorage: MultiVectorStorage>( fn new_multi_scorer_with_metric< 'a, TMetric: Metric + 'a, - TVectorStorage: MultiVectorStorage, + TVectorStorage: MultiVectorStorage, >( query: QueryVector, vector_storage: &'a TVectorStorage, diff --git a/lib/segment/src/vector_storage/simple_multi_dense_vector_storage.rs b/lib/segment/src/vector_storage/simple_multi_dense_vector_storage.rs index c2364308676..0e2835e45d9 100644 --- a/lib/segment/src/vector_storage/simple_multi_dense_vector_storage.rs +++ b/lib/segment/src/vector_storage/simple_multi_dense_vector_storage.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::ops::Range; use std::sync::atomic::AtomicBool; use std::sync::Arc; @@ -12,23 +13,26 @@ use crate::common::operation_error::{check_process_stopped, OperationError, Oper use crate::common::rocksdb_wrapper::DatabaseColumnWrapper; use crate::common::Flusher; use crate::data_types::named_vectors::CowVector; -use crate::data_types::vectors::{MultiDenseVector, VectorRef}; +use crate::data_types::primitive::PrimitiveVectorElement; +use crate::data_types::vectors::{ + MultiDenseVector, TypedMultiDenseVector, VectorElementType, VectorRef, +}; use crate::types::{Distance, MultiVectorConfig, VectorStorageDatatype}; use crate::vector_storage::bitvec::bitvec_set_deleted; use crate::vector_storage::common::StoredRecord; use crate::vector_storage::{MultiVectorStorage, VectorStorage, VectorStorageEnum}; -type StoredMultiDenseVector = StoredRecord; +type StoredMultiDenseVector = StoredRecord>; /// In-memory vector storage with on-update persistence using `store` -pub struct SimpleMultiDenseVectorStorage { +pub struct SimpleMultiDenseVectorStorage { dim: usize, distance: Distance, multi_vector_config: MultiVectorConfig, /// Keep vectors in memory - vectors: Vec, + vectors: Vec>, db_wrapper: DatabaseColumnWrapper, - update_buffer: StoredMultiDenseVector, + update_buffer: StoredMultiDenseVector, /// BitVec for deleted flags. Grows dynamically upto last set flag. deleted: BitVec, /// Current number of deleted vectors. @@ -44,14 +48,14 @@ pub fn open_simple_multi_dense_vector_storage( multi_vector_config: MultiVectorConfig, stopped: &AtomicBool, ) -> OperationResult>> { - let mut vectors: Vec = vec![]; + let mut vectors: Vec> = vec![]; let (mut deleted, mut deleted_count) = (BitVec::new(), 0); let db_wrapper = DatabaseColumnWrapper::new(database, database_column_name); db_wrapper.lock_db().iter()?; for (key, value) in db_wrapper.lock_db().iter()? { let point_id: PointOffsetType = bincode::deserialize(&key) .map_err(|_| OperationError::service_error("cannot deserialize point id from db"))?; - let stored_record: StoredMultiDenseVector = bincode::deserialize(&value) + let stored_record: StoredMultiDenseVector = bincode::deserialize(&value) .map_err(|_| OperationError::service_error("cannot deserialize record from db"))?; // Propagate deleted flag @@ -61,7 +65,7 @@ pub fn open_simple_multi_dense_vector_storage( } let point_id_usize = point_id as usize; if point_id_usize >= vectors.len() { - vectors.resize(point_id_usize + 1, MultiDenseVector::placeholder(dim)); + vectors.resize(point_id_usize + 1, TypedMultiDenseVector::placeholder(dim)); } vectors[point_id_usize] = stored_record.vector; @@ -77,7 +81,7 @@ pub fn open_simple_multi_dense_vector_storage( db_wrapper, update_buffer: StoredMultiDenseVector { deleted: false, - vector: MultiDenseVector::placeholder(dim), + vector: TypedMultiDenseVector::placeholder(dim), }, deleted, deleted_count, @@ -85,7 +89,7 @@ pub fn open_simple_multi_dense_vector_storage( ))) } -impl SimpleMultiDenseVectorStorage { +impl SimpleMultiDenseVectorStorage { /// Set deleted flag for given key. Returns previous deleted state. #[inline] fn set_deleted(&mut self, key: PointOffsetType, deleted: bool) -> bool { @@ -107,7 +111,7 @@ impl SimpleMultiDenseVectorStorage { &mut self, key: PointOffsetType, deleted: bool, - vector: Option, + vector: Option>, ) -> OperationResult<()> { // Write vector state to buffer record let record = &mut self.update_buffer; @@ -126,8 +130,8 @@ impl SimpleMultiDenseVectorStorage { } } -impl MultiVectorStorage for SimpleMultiDenseVectorStorage { - fn get_multi(&self, key: PointOffsetType) -> &MultiDenseVector { +impl MultiVectorStorage for SimpleMultiDenseVectorStorage { + fn get_multi(&self, key: PointOffsetType) -> &TypedMultiDenseVector { self.vectors.get(key as usize).expect("vector not found") } @@ -136,7 +140,7 @@ impl MultiVectorStorage for SimpleMultiDenseVectorStorage { } } -impl VectorStorage for SimpleMultiDenseVectorStorage { +impl VectorStorage for SimpleMultiDenseVectorStorage { fn vector_dim(&self) -> usize { self.dim } @@ -159,16 +163,17 @@ impl VectorStorage for SimpleMultiDenseVectorStorage { fn get_vector(&self, key: PointOffsetType) -> CowVector { let multi_dense_vector = self.vectors.get(key as usize).expect("vector not found"); + let multi_dense_vector = T::into_float_multivector(Cow::Borrowed(multi_dense_vector)); CowVector::from(multi_dense_vector) } fn insert_vector(&mut self, key: PointOffsetType, vector: VectorRef) -> OperationResult<()> { let vector: &MultiDenseVector = vector.try_into()?; - let multi_vector = vector.clone(); + let multi_vector = T::from_float_multivector(Cow::Borrowed(vector)).into_owned(); let key_usize = key as usize; if key_usize >= self.vectors.len() { self.vectors - .resize(key_usize + 1, MultiDenseVector::placeholder(self.dim)); + .resize(key_usize + 1, TypedMultiDenseVector::placeholder(self.dim)); } self.vectors[key_usize] = multi_vector.clone(); self.set_deleted(key, false); @@ -187,8 +192,10 @@ impl VectorStorage for SimpleMultiDenseVectorStorage { check_process_stopped(stopped)?; // Do not perform preprocessing - vectors should be already processed let other_vector = other.get_vector(point_id); - let other_vector: &MultiDenseVector = other_vector.as_vec_ref().try_into()?; - let other_multi_vector = other_vector.clone(); + let other_vector: &TypedMultiDenseVector = + other_vector.as_vec_ref().try_into()?; + let other_multi_vector = + T::from_float_multivector(Cow::Borrowed(other_vector)).into_owned(); let other_deleted = other.is_deleted_vector(point_id); self.vectors.push(other_multi_vector.clone()); let new_id = self.vectors.len() as PointOffsetType - 1; diff --git a/lib/segment/src/vector_storage/vector_storage_base.rs b/lib/segment/src/vector_storage/vector_storage_base.rs index 64625953277..bc3b32053c4 100644 --- a/lib/segment/src/vector_storage/vector_storage_base.rs +++ b/lib/segment/src/vector_storage/vector_storage_base.rs @@ -13,7 +13,7 @@ use crate::common::Flusher; use crate::data_types::named_vectors::CowVector; use crate::data_types::primitive::PrimitiveVectorElement; use crate::data_types::vectors::{ - MultiDenseVector, VectorElementType, VectorElementTypeByte, VectorRef, + TypedMultiDenseVector, VectorElementType, VectorElementTypeByte, VectorRef, }; use crate::types::{Distance, MultiVectorConfig, VectorStorageDatatype}; use crate::vector_storage::dense::appendable_mmap_dense_vector_storage::AppendableMmapDenseVectorStorage; @@ -110,8 +110,8 @@ pub trait SparseVectorStorage: VectorStorage { fn get_sparse(&self, key: PointOffsetType) -> OperationResult; } -pub trait MultiVectorStorage: VectorStorage { - fn get_multi(&self, key: PointOffsetType) -> &MultiDenseVector; +pub trait MultiVectorStorage: VectorStorage { + fn get_multi(&self, key: PointOffsetType) -> &TypedMultiDenseVector; fn multi_vector_config(&self) -> &MultiVectorConfig; } @@ -123,7 +123,7 @@ pub enum VectorStorageEnum { DenseAppendableMemmap(Box>), DenseAppendableMemmapByte(Box>), SparseSimple(SimpleSparseVectorStorage), - MultiDenseSimple(SimpleMultiDenseVectorStorage), + MultiDenseSimple(SimpleMultiDenseVectorStorage), } impl VectorStorage for VectorStorageEnum {