Skip to content

Commit

Permalink
generic multidense vector storage (qdrant#4104)
Browse files Browse the repository at this point in the history
  • Loading branch information
IvanPleshkov authored and generall committed May 2, 2024
1 parent 8e514b8 commit e452b6f
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 44 deletions.
9 changes: 9 additions & 0 deletions lib/segment/src/data_types/named_vectors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ impl<'a> From<MultiDenseVector> for CowVector<'a> {
}
}

/// Wraps an already copy-on-write multi-dense vector as a `CowVector`.
///
/// The incoming `Cow` is placed into the `MultiDense` variant as-is: a
/// borrowed input stays borrowed and an owned input stays owned, so nothing
/// is cloned by this conversion.
impl<'a> From<Cow<'a, MultiDenseVector>> for CowVector<'a> {
    fn from(v: Cow<'a, MultiDenseVector>) -> Self {
        // Matching on Borrowed/Owned only to rebuild the same variant is
        // redundant; the `Cow` can be forwarded directly.
        CowVector::MultiDense(v)
    }
}

impl<'a> From<&'a SparseVector> for CowVector<'a> {
fn from(v: &'a SparseVector) -> Self {
CowVector::Sparse(Cow::Borrowed(v))
Expand Down
47 changes: 47 additions & 0 deletions lib/segment/src/data_types/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::borrow::Cow;
use itertools::Itertools;
use serde::{Deserialize, Serialize};

use super::vectors::TypedMultiDenseVector;
use crate::data_types::vectors::{VectorElementType, VectorElementTypeByte};
use crate::spaces::metric::Metric;
use crate::spaces::simple::{CosineMetric, DotProductMetric, EuclidMetric, ManhattanMetric};
Expand All @@ -22,6 +23,14 @@ pub trait PrimitiveVectorElement:
) -> Cow<'a, [f32]>;

fn datatype() -> VectorStorageDatatype;

/// Converts a multivector with `VectorElementType` elements into a
/// multivector whose elements are `Self`.
///
/// Both the argument and the result are `Cow`s, so an implementation may
/// either hand the input back untouched or return a newly built owned value.
fn from_float_multivector(
multivector: Cow<TypedMultiDenseVector<VectorElementType>>,
) -> Cow<TypedMultiDenseVector<Self>>;

/// Converts a multivector with `Self` elements into a multivector whose
/// elements are `VectorElementType`.
///
/// Both the argument and the result are `Cow`s, so an implementation may
/// either hand the input back untouched or return a newly built owned value.
fn into_float_multivector(
multivector: Cow<TypedMultiDenseVector<Self>>,
) -> Cow<TypedMultiDenseVector<VectorElementType>>;
}

impl PrimitiveVectorElement for VectorElementType {
Expand All @@ -44,6 +53,18 @@ impl PrimitiveVectorElement for VectorElementType {
fn datatype() -> VectorStorageDatatype {
VectorStorageDatatype::Float32
}

/// Identity conversion: in this impl `Self` is `VectorElementType`, so the
/// multivector already has the requested element type.
fn from_float_multivector(
multivector: Cow<TypedMultiDenseVector<VectorElementType>>,
) -> Cow<TypedMultiDenseVector<Self>> {
// Hand the same `Cow` back; nothing is cloned or converted.
multivector
}

/// Identity conversion: in this impl `Self` is `VectorElementType`, so the
/// multivector is already in the float representation.
fn into_float_multivector(
multivector: Cow<TypedMultiDenseVector<Self>>,
) -> Cow<TypedMultiDenseVector<VectorElementType>> {
// Hand the same `Cow` back; nothing is cloned or converted.
multivector
}
}

impl PrimitiveVectorElement for VectorElementTypeByte {
Expand Down Expand Up @@ -86,4 +107,30 @@ impl PrimitiveVectorElement for VectorElementTypeByte {
fn datatype() -> VectorStorageDatatype {
VectorStorageDatatype::Uint8
}

/// Builds an owned `Self`-typed multivector from a float multivector.
///
/// Every element of the flattened storage is converted with an `as` cast and
/// the per-vector dimension (`dim`) is carried over unchanged. The result is
/// always `Cow::Owned`, even when the input was borrowed.
fn from_float_multivector(
    multivector: Cow<TypedMultiDenseVector<VectorElementType>>,
) -> Cow<TypedMultiDenseVector<Self>> {
    // NOTE(review): if `VectorElementTypeByte` is `u8`, this `as` cast drops
    // the fractional part and saturates out-of-range values — confirm that
    // this is the intended quantization.
    let converted = multivector
        .inner_vector
        .iter()
        .map(|&element| element as Self)
        .collect_vec();
    let dim = multivector.dim;
    Cow::Owned(TypedMultiDenseVector::new(converted, dim))
}

/// Builds an owned float multivector from a `Self`-typed multivector.
///
/// Every element of the flattened storage is widened to `VectorElementType`
/// with an `as` cast and the per-vector dimension (`dim`) is carried over
/// unchanged. The result is always `Cow::Owned`, even when the input was
/// borrowed.
fn into_float_multivector(
    multivector: Cow<TypedMultiDenseVector<Self>>,
) -> Cow<TypedMultiDenseVector<VectorElementType>> {
    let as_floats = multivector
        .inner_vector
        .iter()
        .map(|&element| element as VectorElementType)
        .collect_vec();
    let dim = multivector.dim;
    Cow::Owned(TypedMultiDenseVector::new(as_floats, dim))
}
}
28 changes: 16 additions & 12 deletions lib/segment/src/data_types/vectors.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
use std::collections::HashMap;
use std::slice::ChunksExactMut;

use itertools::Itertools;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use sparse::common::sparse_vector::SparseVector;
use validator::Validate;

use super::named_vectors::NamedVectors;
use super::primitive::PrimitiveVectorElement;
use crate::common::operation_error::OperationError;
use crate::common::utils::transpose_map_into_named_vector;
use crate::vector_storage::query::context_query::ContextQuery;
Expand Down Expand Up @@ -176,13 +178,15 @@ pub type DenseVector = TypedDenseVector<VectorElementType>;

/// Type for multi dense vector
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize)]
pub struct MultiDenseVector {
pub inner_vector: DenseVector, // vectors are flattened into a single vector
pub dim: usize, // dimension of each vector
pub struct TypedMultiDenseVector<T> {
pub inner_vector: TypedDenseVector<T>, // vectors are flattened into a single vector
pub dim: usize, // dimension of each vector
}

impl MultiDenseVector {
pub fn new(flattened_vectors: DenseVector, dim: usize) -> Self {
pub type MultiDenseVector = TypedMultiDenseVector<VectorElementType>;

impl<T: PrimitiveVectorElement> TypedMultiDenseVector<T> {
pub fn new(flattened_vectors: TypedDenseVector<T>, dim: usize) -> Self {
Self {
inner_vector: flattened_vectors,
dim,
Expand All @@ -192,17 +196,17 @@ impl MultiDenseVector {
/// MultiDenseVector cannot be empty, so we use a placeholder vector instead
pub fn placeholder(dim: usize) -> Self {
Self {
inner_vector: vec![1.0; dim],
inner_vector: vec![Default::default(); dim],
dim,
}
}

/// Slices the multi vector into the underlying individual vectors
pub fn multi_vectors(&self) -> impl Iterator<Item = &[VectorElementType]> {
pub fn multi_vectors(&self) -> impl Iterator<Item = &[T]> {
self.inner_vector.chunks_exact(self.dim)
}

pub fn multi_vectors_mut(&mut self) -> ChunksExactMut<'_, VectorElementType> {
pub fn multi_vectors_mut(&mut self) -> ChunksExactMut<'_, T> {
self.inner_vector.chunks_exact_mut(self.dim)
}

Expand All @@ -211,10 +215,10 @@ impl MultiDenseVector {
}
}

impl TryFrom<Vec<DenseVector>> for MultiDenseVector {
impl<T: PrimitiveVectorElement> TryFrom<Vec<TypedDenseVector<T>>> for TypedMultiDenseVector<T> {
type Error = OperationError;

fn try_from(value: Vec<DenseVector>) -> Result<Self, Self::Error> {
fn try_from(value: Vec<TypedDenseVector<T>>) -> Result<Self, Self::Error> {
if value.is_empty() {
return Err(OperationError::ValidationError {
description: "MultiDenseVector cannot be empty".to_string(),
Expand All @@ -228,8 +232,8 @@ impl TryFrom<Vec<DenseVector>> for MultiDenseVector {
received_dim: bad_vec.len(),
})
} else {
let inner_vector = value.into_iter().flatten().collect();
let multi_dense = MultiDenseVector { inner_vector, dim };
let inner_vector = value.into_iter().flatten().collect_vec();
let multi_dense = TypedMultiDenseVector { inner_vector, dim };
Ok(multi_dense)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::vector_storage::MultiVectorStorage;
pub struct MultiCustomQueryScorer<
'a,
TMetric: Metric<VectorElementType>,
TVectorStorage: MultiVectorStorage,
TVectorStorage: MultiVectorStorage<VectorElementType>,
TQuery: Query<MultiDenseVector>,
> {
vector_storage: &'a TVectorStorage,
Expand All @@ -23,7 +23,7 @@ pub struct MultiCustomQueryScorer<
impl<
'a,
TMetric: Metric<VectorElementType>,
TVectorStorage: MultiVectorStorage,
TVectorStorage: MultiVectorStorage<VectorElementType>,
TQuery: Query<MultiDenseVector> + TransformInto<TQuery, MultiDenseVector, MultiDenseVector>,
> MultiCustomQueryScorer<'a, TMetric, TVectorStorage, TQuery>
{
Expand All @@ -50,7 +50,7 @@ impl<
impl<
'a,
TMetric: Metric<VectorElementType>,
TVectorStorage: MultiVectorStorage,
TVectorStorage: MultiVectorStorage<VectorElementType>,
TQuery: Query<MultiDenseVector>,
> QueryScorer<MultiDenseVector>
for MultiCustomQueryScorer<'a, TMetric, TVectorStorage, TQuery>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@ use crate::vector_storage::MultiVectorStorage;
pub struct MultiMetricQueryScorer<
'a,
TMetric: Metric<VectorElementType>,
TVectorStorage: MultiVectorStorage,
TVectorStorage: MultiVectorStorage<VectorElementType>,
> {
vector_storage: &'a TVectorStorage,
query: MultiDenseVector,
metric: PhantomData<TMetric>,
}

impl<'a, TMetric: Metric<VectorElementType>, TVectorStorage: MultiVectorStorage>
MultiMetricQueryScorer<'a, TMetric, TVectorStorage>
impl<
'a,
TMetric: Metric<VectorElementType>,
TVectorStorage: MultiVectorStorage<VectorElementType>,
> MultiMetricQueryScorer<'a, TMetric, TVectorStorage>
{
pub fn new(query: MultiDenseVector, vector_storage: &'a TVectorStorage) -> Self {
let slices = query.multi_vectors();
Expand Down Expand Up @@ -47,8 +50,11 @@ impl<'a, TMetric: Metric<VectorElementType>, TVectorStorage: MultiVectorStorage>
}
}

impl<'a, TMetric: Metric<VectorElementType>, TVectorStorage: MultiVectorStorage>
QueryScorer<MultiDenseVector> for MultiMetricQueryScorer<'a, TMetric, TVectorStorage>
impl<
'a,
TMetric: Metric<VectorElementType>,
TVectorStorage: MultiVectorStorage<VectorElementType>,
> QueryScorer<MultiDenseVector> for MultiMetricQueryScorer<'a, TMetric, TVectorStorage>
{
#[inline]
fn score_stored(&self, idx: PointOffsetType) -> ScoreType {
Expand Down
4 changes: 2 additions & 2 deletions lib/segment/src/vector_storage/raw_scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ where
}))
}

pub fn raw_multi_scorer_impl<'a, TVectorStorage: MultiVectorStorage>(
pub fn raw_multi_scorer_impl<'a, TVectorStorage: MultiVectorStorage<VectorElementType>>(
query: QueryVector,
vector_storage: &'a TVectorStorage,
point_deleted: &'a BitSlice,
Expand Down Expand Up @@ -451,7 +451,7 @@ pub fn raw_multi_scorer_impl<'a, TVectorStorage: MultiVectorStorage>(
fn new_multi_scorer_with_metric<
'a,
TMetric: Metric<VectorElementType> + 'a,
TVectorStorage: MultiVectorStorage,
TVectorStorage: MultiVectorStorage<VectorElementType>,
>(
query: QueryVector,
vector_storage: &'a TVectorStorage,
Expand Down
43 changes: 25 additions & 18 deletions lib/segment/src/vector_storage/simple_multi_dense_vector_storage.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::ops::Range;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
Expand All @@ -12,23 +13,26 @@ use crate::common::operation_error::{check_process_stopped, OperationError, Oper
use crate::common::rocksdb_wrapper::DatabaseColumnWrapper;
use crate::common::Flusher;
use crate::data_types::named_vectors::CowVector;
use crate::data_types::vectors::{MultiDenseVector, VectorRef};
use crate::data_types::primitive::PrimitiveVectorElement;
use crate::data_types::vectors::{
MultiDenseVector, TypedMultiDenseVector, VectorElementType, VectorRef,
};
use crate::types::{Distance, MultiVectorConfig, VectorStorageDatatype};
use crate::vector_storage::bitvec::bitvec_set_deleted;
use crate::vector_storage::common::StoredRecord;
use crate::vector_storage::{MultiVectorStorage, VectorStorage, VectorStorageEnum};

type StoredMultiDenseVector = StoredRecord<MultiDenseVector>;
type StoredMultiDenseVector<T> = StoredRecord<TypedMultiDenseVector<T>>;

/// In-memory vector storage with on-update persistence using `store`
pub struct SimpleMultiDenseVectorStorage {
pub struct SimpleMultiDenseVectorStorage<T: PrimitiveVectorElement> {
dim: usize,
distance: Distance,
multi_vector_config: MultiVectorConfig,
/// Keep vectors in memory
vectors: Vec<MultiDenseVector>,
vectors: Vec<TypedMultiDenseVector<T>>,
db_wrapper: DatabaseColumnWrapper,
update_buffer: StoredMultiDenseVector,
update_buffer: StoredMultiDenseVector<T>,
/// BitVec for deleted flags. Grows dynamically up to the last set flag.
deleted: BitVec,
/// Current number of deleted vectors.
Expand All @@ -44,14 +48,14 @@ pub fn open_simple_multi_dense_vector_storage(
multi_vector_config: MultiVectorConfig,
stopped: &AtomicBool,
) -> OperationResult<Arc<AtomicRefCell<VectorStorageEnum>>> {
let mut vectors: Vec<MultiDenseVector> = vec![];
let mut vectors: Vec<TypedMultiDenseVector<VectorElementType>> = vec![];
let (mut deleted, mut deleted_count) = (BitVec::new(), 0);
let db_wrapper = DatabaseColumnWrapper::new(database, database_column_name);
db_wrapper.lock_db().iter()?;
for (key, value) in db_wrapper.lock_db().iter()? {
let point_id: PointOffsetType = bincode::deserialize(&key)
.map_err(|_| OperationError::service_error("cannot deserialize point id from db"))?;
let stored_record: StoredMultiDenseVector = bincode::deserialize(&value)
let stored_record: StoredMultiDenseVector<VectorElementType> = bincode::deserialize(&value)
.map_err(|_| OperationError::service_error("cannot deserialize record from db"))?;

// Propagate deleted flag
Expand All @@ -61,7 +65,7 @@ pub fn open_simple_multi_dense_vector_storage(
}
let point_id_usize = point_id as usize;
if point_id_usize >= vectors.len() {
vectors.resize(point_id_usize + 1, MultiDenseVector::placeholder(dim));
vectors.resize(point_id_usize + 1, TypedMultiDenseVector::placeholder(dim));
}
vectors[point_id_usize] = stored_record.vector;

Expand All @@ -77,15 +81,15 @@ pub fn open_simple_multi_dense_vector_storage(
db_wrapper,
update_buffer: StoredMultiDenseVector {
deleted: false,
vector: MultiDenseVector::placeholder(dim),
vector: TypedMultiDenseVector::placeholder(dim),
},
deleted,
deleted_count,
}),
)))
}

impl SimpleMultiDenseVectorStorage {
impl<T: PrimitiveVectorElement> SimpleMultiDenseVectorStorage<T> {
/// Set deleted flag for given key. Returns previous deleted state.
#[inline]
fn set_deleted(&mut self, key: PointOffsetType, deleted: bool) -> bool {
Expand All @@ -107,7 +111,7 @@ impl SimpleMultiDenseVectorStorage {
&mut self,
key: PointOffsetType,
deleted: bool,
vector: Option<MultiDenseVector>,
vector: Option<TypedMultiDenseVector<T>>,
) -> OperationResult<()> {
// Write vector state to buffer record
let record = &mut self.update_buffer;
Expand All @@ -126,8 +130,8 @@ impl SimpleMultiDenseVectorStorage {
}
}

impl MultiVectorStorage for SimpleMultiDenseVectorStorage {
fn get_multi(&self, key: PointOffsetType) -> &MultiDenseVector {
impl<T: PrimitiveVectorElement> MultiVectorStorage<T> for SimpleMultiDenseVectorStorage<T> {
fn get_multi(&self, key: PointOffsetType) -> &TypedMultiDenseVector<T> {
self.vectors.get(key as usize).expect("vector not found")
}

Expand All @@ -136,7 +140,7 @@ impl MultiVectorStorage for SimpleMultiDenseVectorStorage {
}
}

impl VectorStorage for SimpleMultiDenseVectorStorage {
impl<T: PrimitiveVectorElement> VectorStorage for SimpleMultiDenseVectorStorage<T> {
fn vector_dim(&self) -> usize {
self.dim
}
Expand All @@ -159,16 +163,17 @@ impl VectorStorage for SimpleMultiDenseVectorStorage {

/// Returns the multivector stored at `key`, converted to the float
/// representation via `T::into_float_multivector`.
///
/// Panics with "vector not found" when `key` is outside the stored range.
fn get_vector(&self, key: PointOffsetType) -> CowVector {
    let stored = self.vectors.get(key as usize).expect("vector not found");
    // Pass a borrow so element types that need no conversion can avoid a copy.
    CowVector::from(T::into_float_multivector(Cow::Borrowed(stored)))
}

fn insert_vector(&mut self, key: PointOffsetType, vector: VectorRef) -> OperationResult<()> {
let vector: &MultiDenseVector = vector.try_into()?;
let multi_vector = vector.clone();
let multi_vector = T::from_float_multivector(Cow::Borrowed(vector)).into_owned();
let key_usize = key as usize;
if key_usize >= self.vectors.len() {
self.vectors
.resize(key_usize + 1, MultiDenseVector::placeholder(self.dim));
.resize(key_usize + 1, TypedMultiDenseVector::placeholder(self.dim));
}
self.vectors[key_usize] = multi_vector.clone();
self.set_deleted(key, false);
Expand All @@ -187,8 +192,10 @@ impl VectorStorage for SimpleMultiDenseVectorStorage {
check_process_stopped(stopped)?;
// Do not perform preprocessing - vectors should be already processed
let other_vector = other.get_vector(point_id);
let other_vector: &MultiDenseVector = other_vector.as_vec_ref().try_into()?;
let other_multi_vector = other_vector.clone();
let other_vector: &TypedMultiDenseVector<VectorElementType> =
other_vector.as_vec_ref().try_into()?;
let other_multi_vector =
T::from_float_multivector(Cow::Borrowed(other_vector)).into_owned();
let other_deleted = other.is_deleted_vector(point_id);
self.vectors.push(other_multi_vector.clone());
let new_id = self.vectors.len() as PointOffsetType - 1;
Expand Down
Loading

0 comments on commit e452b6f

Please sign in to comment.