Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Byte storage integration into segment #4049

Merged
merged 16 commits into from
Apr 17, 2024
Merged
Prev Previous commit
Next Next commit
apply preprocessing for only cosine float metric
  • Loading branch information
IvanPleshkov committed Apr 17, 2024
commit 1f39aa90e7d8f232173064a45cf2b669bd763e4f
56 changes: 47 additions & 9 deletions lib/segment/src/data_types/named_vectors.rs
Original file line number Diff line number Diff line change
@@ -4,9 +4,13 @@ use std::collections::HashMap;
use sparse::common::sparse_vector::SparseVector;

use super::tiny_map;
use super::vectors::{DenseVector, MultiDenseVector, Vector, VectorElementType, VectorRef};
use super::vectors::{
DenseVector, MultiDenseVector, Vector, VectorElementType, VectorElementTypeByte, VectorRef,
};
use crate::common::operation_error::OperationError;
use crate::types::Distance;
use crate::spaces::metric::Metric;
use crate::spaces::simple::{CosineMetric, DotProductMetric, EuclidMetric, ManhattanMetric};
use crate::types::{Distance, SegmentConfig, VectorDataConfig, VectorStorageDatatype};

type CowKey<'a> = Cow<'a, str>;

@@ -212,15 +216,12 @@ impl<'a> NamedVectors<'a> {
self.map.get(key).map(|v| v.as_vec_ref())
}

pub fn preprocess<F>(&mut self, distance_map: F)
where
F: Fn(&str) -> Distance,
{
pub fn preprocess(&mut self, segment_config: &SegmentConfig) {
for (name, vector) in self.map.iter_mut() {
let distance = distance_map(name);
let config = segment_config.vector_data.get(name.as_ref()).unwrap();
match vector {
CowVector::Dense(v) => {
let preprocessed_vector = distance.preprocess_vector(v.to_vec());
let preprocessed_vector = Self::preprocess_dense_vector(v.to_vec(), config);
*vector = CowVector::Dense(Cow::Owned(preprocessed_vector))
}
CowVector::Sparse(v) => {
@@ -229,14 +230,51 @@ impl<'a> NamedVectors<'a> {
}
CowVector::MultiDense(v) => {
for dense_vector in v.to_mut().multi_vectors_mut() {
let preprocessed_vector = distance.preprocess_vector(dense_vector.to_vec());
let preprocessed_vector =
Self::preprocess_dense_vector(dense_vector.to_vec(), config);
// replace dense vector with preprocessed vector
dense_vector.copy_from_slice(&preprocessed_vector);
}
}
}
}
}

fn preprocess_dense_vector(
dense_vector: DenseVector,
config: &VectorDataConfig,
) -> DenseVector {
match config.datatype {
Some(VectorStorageDatatype::Float) | None => match config.distance {
Distance::Cosine => {
<CosineMetric as Metric<VectorElementType>>::preprocess(dense_vector)
}
Distance::Euclid => {
<EuclidMetric as Metric<VectorElementType>>::preprocess(dense_vector)
}
Distance::Dot => {
<DotProductMetric as Metric<VectorElementType>>::preprocess(dense_vector)
}
Distance::Manhattan => {
<ManhattanMetric as Metric<VectorElementType>>::preprocess(dense_vector)
}
},
Some(VectorStorageDatatype::Uint8) => match config.distance {
Distance::Cosine => {
<CosineMetric as Metric<VectorElementTypeByte>>::preprocess(dense_vector)
}
Distance::Euclid => {
<EuclidMetric as Metric<VectorElementTypeByte>>::preprocess(dense_vector)
}
Distance::Dot => {
<DotProductMetric as Metric<VectorElementTypeByte>>::preprocess(dense_vector)
}
Distance::Manhattan => {
<ManhattanMetric as Metric<VectorElementTypeByte>>::preprocess(dense_vector)
}
},
}
}
}

impl<'a> IntoIterator for NamedVectors<'a> {
22 changes: 9 additions & 13 deletions lib/segment/src/data_types/primitive.rs
Original file line number Diff line number Diff line change
@@ -3,16 +3,15 @@ use std::borrow::Cow;
use itertools::Itertools;
use serde::{Deserialize, Serialize};

use crate::common::operation_error::OperationResult;
use crate::data_types::vectors::{VectorElementType, VectorElementTypeByte, VectorRef};
use crate::data_types::vectors::{VectorElementType, VectorElementTypeByte};
use crate::types::QuantizationConfig;

pub trait PrimitiveVectorElement:
Copy + Clone + Default + Serialize + for<'a> Deserialize<'a>
{
fn from_vector_ref(vector: VectorRef) -> OperationResult<Cow<[Self]>>;
fn from_dense_vector(vector: &[VectorElementType]) -> Cow<[Self]>;

fn slice_to_float_cow(vector: &[Self]) -> Cow<[f32]>;
fn slice_to_float_cow(vector: &[Self]) -> Cow<[VectorElementType]>;

fn quantization_preprocess<'a>(
quantization_config: &QuantizationConfig,
@@ -21,12 +20,11 @@ pub trait PrimitiveVectorElement:
}

impl PrimitiveVectorElement for VectorElementType {
fn from_vector_ref(vector: VectorRef) -> OperationResult<Cow<[Self]>> {
let vector_ref: &[Self] = vector.try_into()?;
Ok(Cow::from(vector_ref))
fn from_dense_vector(vector: &[VectorElementType]) -> Cow<[Self]> {
Cow::Borrowed(vector)
}

fn slice_to_float_cow(vector: &[Self]) -> Cow<[f32]> {
fn slice_to_float_cow(vector: &[Self]) -> Cow<[VectorElementType]> {
vector.into()
}

@@ -39,13 +37,11 @@ impl PrimitiveVectorElement for VectorElementType {
}

impl PrimitiveVectorElement for VectorElementTypeByte {
fn from_vector_ref(vector: VectorRef) -> OperationResult<Cow<[Self]>> {
let vector_ref: &[VectorElementType] = vector.try_into()?;
let byte_vector = vector_ref.iter().map(|&x| x as u8).collect::<Vec<u8>>();
Ok(Cow::from(byte_vector))
fn from_dense_vector(vector: &[VectorElementType]) -> Cow<[Self]> {
Cow::Owned(vector.iter().map(|&x| x as u8).collect())
}

fn slice_to_float_cow(vector: &[Self]) -> Cow<[f32]> {
fn slice_to_float_cow(vector: &[Self]) -> Cow<[VectorElementType]> {
Cow::from(vector.iter().map(|&x| x as VectorElementType).collect_vec())
}

2 changes: 1 addition & 1 deletion lib/segment/src/index/hnsw_index/graph_layers.rs
Original file line number Diff line number Diff line change
@@ -451,7 +451,7 @@ mod tests {

let top = 5;
let query = random_vector(&mut rng, dim);
let processed_query = M::preprocess(query.clone());
let processed_query = <M as Metric<VectorElementType>>::preprocess(query.clone());
let mut reference_top = FixedLengthPriorityQueue::new(top);
for idx in 0..vector_holder.vectors.len() as PointOffsetType {
let vec = &vector_holder.vectors.get(idx);
4 changes: 2 additions & 2 deletions lib/segment/src/index/hnsw_index/graph_layers_builder.rs
Original file line number Diff line number Diff line change
@@ -632,7 +632,7 @@ mod tests {

let top = 5;
let query = random_vector(&mut rng, dim);
let processed_query = M::preprocess(query.clone());
let processed_query = <M as Metric<VectorElementType>>::preprocess(query.clone());
let mut reference_top = FixedLengthPriorityQueue::new(top);
for idx in 0..vector_holder.vectors.len() as PointOffsetType {
let vec = &vector_holder.vectors.get(idx);
@@ -715,7 +715,7 @@ mod tests {

let top = 5;
let query = random_vector(&mut rng, dim);
let processed_query = M::preprocess(query.clone());
let processed_query = <M as Metric<VectorElementType>>::preprocess(query.clone());
let mut reference_top = FixedLengthPriorityQueue::new(top);
for idx in 0..vector_holder.vectors.len() as PointOffsetType {
let vec = &vector_holder.vectors.get(idx);
4 changes: 2 additions & 2 deletions lib/segment/src/segment.rs
Original file line number Diff line number Diff line change
@@ -1052,7 +1052,7 @@ impl SegmentEntry for Segment {
) -> OperationResult<bool> {
debug_assert!(self.is_appendable());
check_named_vectors(&vectors, &self.segment_config)?;
vectors.preprocess(|name| self.segment_config.distance(name).unwrap());
vectors.preprocess(&self.segment_config);
let stored_internal_point = self.id_tracker.borrow().internal_id(point_id);
self.handle_point_version_and_failure(op_num, stored_internal_point, |segment| {
if let Some(existing_internal_id) = stored_internal_point {
@@ -1105,7 +1105,7 @@ impl SegmentEntry for Segment {
mut vectors: NamedVectors,
) -> OperationResult<bool> {
check_named_vectors(&vectors, &self.segment_config)?;
vectors.preprocess(|name| self.segment_config.distance(name).unwrap());
vectors.preprocess(&self.segment_config);
let internal_id = self.id_tracker.borrow().internal_id(point_id);
match internal_id {
None => Err(OperationError::PointIdError {
4 changes: 2 additions & 2 deletions lib/segment/src/spaces/metric.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use common::types::ScoreType;

use crate::data_types::primitive::PrimitiveVectorElement;
use crate::data_types::vectors::{TypedDenseVector, VectorElementType};
use crate::data_types::vectors::DenseVector;
use crate::types::Distance;

/// Defines how to compare vectors
@@ -13,7 +13,7 @@ pub trait Metric<T: PrimitiveVectorElement> {

/// Applies the transformations a vector needs before it is added to the collection (e.g. normalization).
/// If no transformation is needed, the vector is returned unchanged.
fn preprocess(vector: TypedDenseVector<VectorElementType>) -> TypedDenseVector<T>;
fn preprocess(vector: DenseVector) -> DenseVector;
}

pub trait MetricPostProcessing {
7 changes: 2 additions & 5 deletions lib/segment/src/spaces/metric_uint/simple_cosine.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use common::types::ScoreType;

use crate::data_types::vectors::{DenseVector, TypedDenseVector, VectorElementTypeByte};
use crate::data_types::vectors::{DenseVector, VectorElementTypeByte};
use crate::spaces::metric::Metric;
#[cfg(target_arch = "x86_64")]
use crate::spaces::metric_uint::avx2::cosine::avx_cosine_similarity_bytes;
@@ -50,11 +50,8 @@ impl Metric<VectorElementTypeByte> for CosineMetric {
cosine_similarity_bytes(v1, v2)
}

fn preprocess(vector: DenseVector) -> TypedDenseVector<VectorElementTypeByte> {
fn preprocess(vector: DenseVector) -> DenseVector {
vector
.into_iter()
.map(|x| x as VectorElementTypeByte)
.collect()
}
}

7 changes: 2 additions & 5 deletions lib/segment/src/spaces/metric_uint/simple_dot.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use common::types::ScoreType;

use crate::data_types::vectors::{DenseVector, TypedDenseVector, VectorElementTypeByte};
use crate::data_types::vectors::{DenseVector, VectorElementTypeByte};
use crate::spaces::metric::Metric;
#[cfg(target_arch = "x86_64")]
use crate::spaces::metric_uint::avx2::dot::avx_dot_similarity_bytes;
@@ -50,11 +50,8 @@ impl Metric<VectorElementTypeByte> for DotProductMetric {
dot_similarity_bytes(v1, v2)
}

fn preprocess(vector: DenseVector) -> TypedDenseVector<VectorElementTypeByte> {
fn preprocess(vector: DenseVector) -> DenseVector {
vector
.into_iter()
.map(|x| x as VectorElementTypeByte)
.collect()
}
}

14 changes: 7 additions & 7 deletions lib/segment/src/spaces/metric_uint/simple_euclid.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use common::types::ScoreType;

use crate::data_types::vectors::{DenseVector, TypedDenseVector, VectorElementTypeByte};
use crate::data_types::vectors::{DenseVector, VectorElementTypeByte};
use crate::spaces::metric::Metric;
#[cfg(target_arch = "x86_64")]
use crate::spaces::metric_uint::avx2::euclid::avx_euclid_similarity_bytes;
@@ -50,11 +50,8 @@ impl Metric<VectorElementTypeByte> for EuclidMetric {
euclid_similarity_bytes(v1, v2)
}

fn preprocess(vector: DenseVector) -> TypedDenseVector<VectorElementTypeByte> {
fn preprocess(vector: DenseVector) -> DenseVector {
vector
.into_iter()
.map(|x| x as VectorElementTypeByte)
.collect()
}
}

@@ -74,12 +71,15 @@ pub fn euclid_similarity_bytes(
#[cfg(test)]
mod tests {
use super::*;
use crate::data_types::primitive::PrimitiveVectorElement;
use crate::data_types::vectors::{TypedDenseVector, VectorElementType};

#[test]
fn test_conversion_to_bytes() {
let dense_vector = DenseVector::from(vec![-10.0, 1.0, 2.0, 3.0, 255., 300.]);
let typed_dense_vector: TypedDenseVector<VectorElementTypeByte> =
EuclidMetric::preprocess(dense_vector);
let preprocessed_vector =
<EuclidMetric as Metric<VectorElementType>>::preprocess(dense_vector);
let typed_dense_vector = VectorElementTypeByte::from_dense_vector(&preprocessed_vector);
let expected: TypedDenseVector<VectorElementTypeByte> = vec![0, 1, 2, 3, 255, 255];
assert_eq!(typed_dense_vector, expected);
}
7 changes: 2 additions & 5 deletions lib/segment/src/spaces/metric_uint/simple_manhattan.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use common::types::ScoreType;

use crate::data_types::vectors::{DenseVector, TypedDenseVector, VectorElementTypeByte};
use crate::data_types::vectors::{DenseVector, VectorElementTypeByte};
use crate::spaces::metric::Metric;
#[cfg(target_arch = "x86_64")]
use crate::spaces::metric_uint::avx2::manhattan::avx_manhattan_similarity_bytes;
@@ -50,11 +50,8 @@ impl Metric<VectorElementTypeByte> for ManhattanMetric {
manhattan_similarity_bytes(v1, v2)
}

fn preprocess(vector: DenseVector) -> TypedDenseVector<VectorElementTypeByte> {
fn preprocess(vector: DenseVector) -> DenseVector {
vector
.into_iter()
.map(|x| x as VectorElementTypeByte)
.collect()
}
}

7 changes: 4 additions & 3 deletions lib/segment/src/spaces/simple.rs
Original file line number Diff line number Diff line change
@@ -246,7 +246,7 @@ mod tests {

#[test]
fn test_cosine_preprocessing() {
let res: DenseVector = CosineMetric::preprocess(vec![0.0, 0.0, 0.0, 0.0]);
let res = <CosineMetric as Metric<VectorElementType>>::preprocess(vec![0.0, 0.0, 0.0, 0.0]);
assert_eq!(res, vec![0.0, 0.0, 0.0, 0.0]);
}

@@ -264,8 +264,9 @@ mod tests {
let vector: Vec<_> = (0..DIM).map(|_| rng.gen_range(range.clone())).collect();

// Preprocess and re-preprocess
let preprocess1 = CosineMetric::preprocess(vector);
let preprocess2: DenseVector = CosineMetric::preprocess(preprocess1.clone());
let preprocess1 = <CosineMetric as Metric<VectorElementType>>::preprocess(vector);
let preprocess2: DenseVector =
<CosineMetric as Metric<VectorElementType>>::preprocess(preprocess1.clone());

// All following preprocess attempts must be the same
assert_eq!(
11 changes: 1 addition & 10 deletions lib/segment/src/types.rs
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ use crate::common::operation_error::{OperationError, OperationResult};
use crate::common::utils::{self, MultiValue};
use crate::data_types::integer_index::IntegerIndexParams;
use crate::data_types::text_index::TextIndexParams;
use crate::data_types::vectors::{DenseVector, VectorElementType, VectorStruct};
use crate::data_types::vectors::{VectorElementType, VectorStruct};
use crate::index::sparse_index::sparse_index_config::{SparseIndexConfig, SparseIndexType};
use crate::json_path::{JsonPath, JsonPathInterface};
use crate::spaces::metric::{Metric, MetricPostProcessing};
@@ -166,15 +166,6 @@ pub enum Distance {
}

impl Distance {
pub fn preprocess_vector(&self, vector: DenseVector) -> DenseVector {
match self {
Distance::Cosine => CosineMetric::preprocess(vector),
Distance::Euclid => EuclidMetric::preprocess(vector),
Distance::Dot => DotProductMetric::preprocess(vector),
Distance::Manhattan => ManhattanMetric::preprocess(vector),
}
}

pub fn postprocess_score(&self, score: ScoreType) -> ScoreType {
match self {
Distance::Cosine => CosineMetric::postprocess(score),
Original file line number Diff line number Diff line change
@@ -116,7 +116,8 @@ impl<T: PrimitiveVectorElement> VectorStorage for AppendableMmapDenseVectorStora
}

fn insert_vector(&mut self, key: PointOffsetType, vector: VectorRef) -> OperationResult<()> {
let vector = T::from_vector_ref(vector)?;
let vector: &[VectorElementType] = vector.try_into()?;
let vector = T::from_dense_vector(vector);
self.vectors.insert(key, vector.as_ref())?;
self.set_deleted(key, false)?;
Ok(())
@@ -134,7 +135,8 @@ impl<T: PrimitiveVectorElement> VectorStorage for AppendableMmapDenseVectorStora
// Do not perform preprocessing - vectors should be already processed
let other_deleted = other.is_deleted_vector(point_id);
let other_vector = other.get_vector(point_id);
let other_vector = T::from_vector_ref(other_vector.as_vec_ref())?;
let other_vector: &[VectorElementType] = other_vector.as_vec_ref().try_into()?;
let other_vector = T::from_dense_vector(other_vector);
let new_id = self.vectors.push(other_vector.as_ref())?;
self.set_deleted(new_id, other_deleted)?;
}
Original file line number Diff line number Diff line change
@@ -142,7 +142,8 @@ impl<T: PrimitiveVectorElement> VectorStorage for MemmapDenseVectorStorage<T> {
for id in other_ids {
check_process_stopped(stopped)?;
let other_vector = other.get_vector(id);
let vector = T::from_vector_ref(other_vector.as_vec_ref())?;
let other_vector: &[VectorElementType] = other_vector.as_vec_ref().try_into()?;
let vector = T::from_dense_vector(other_vector);
let raw_bites = mmap_ops::transmute_to_u8_slice(vector.as_ref());
vectors_file.write_all(raw_bites)?;
end_index += 1;
Loading