Skip to content

Commit

Permalink
DocArray as a vectorstore (#351)
Browse files Browse the repository at this point in the history
* feat: docarray vectorstore

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>

* test: fix issues

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>

* refactor: add pgvector back

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>

---------

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>
  • Loading branch information
jupyterjazz authored May 16, 2023
1 parent 543e329 commit 8e11f55
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 67 deletions.
24 changes: 12 additions & 12 deletions examples/data_manager/vector_store.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from gptcache.adapter import openai
from gptcache import cache
from gptcache.manager import get_data_manager, CacheBase, VectorBase
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
import numpy as np

from gptcache import cache
from gptcache.adapter import openai
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

d = 8

Expand All @@ -17,26 +17,26 @@ def run():
'faiss',
'milvus',
'chromadb',
'docarray',
]
for vector_store in vector_stores:
cache_base = CacheBase('sqlite')
vector_base = VectorBase(vector_store, dimension=d)
data_manager = get_data_manager(cache_base, vector_base)

cache.init(embedding_func=mock_embeddings,
data_manager=data_manager,
similarity_evaluation=SearchDistanceEvaluation(),
)
cache.init(
embedding_func=mock_embeddings,
data_manager=data_manager,
similarity_evaluation=SearchDistanceEvaluation(),
)
cache.set_openai_key()

answer = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[
{'role': 'user', 'content': 'what is chatgpt'}
],
messages=[{'role': 'user', 'content': 'what is chatgpt'}],
)
print(answer)


if __name__ == '__main__':
run()
run()
2 changes: 1 addition & 1 deletion gptcache/manager/vector_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def VectorBase(name: str, **kwargs):
:param name: the name of the vectorbase; supported values include 'milvus', 'faiss', 'chromadb', 'hnswlib', 'pgvector' and 'docarray'.
:type name: str
:param top_k: the umber of the vectors results to return, defaults to 1.
:param top_k: the number of the vectors results to return, defaults to 1.
:type top_k: int
:param dimension: the dimension of the vector, defaults to 0.
Expand Down
2 changes: 1 addition & 1 deletion gptcache/manager/vector_data/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Chromadb(VectorBase):
:type persist_directory: str
:param collection_name: the name of the collection in Chromadb, defaults to 'gptcache'.
:type collection_name: str
:param top_k: the umber of the vectors results to return, defaults to 1.
:param top_k: the number of the vectors results to return, defaults to 1.
:type top_k: int
"""
Expand Down
90 changes: 90 additions & 0 deletions gptcache/manager/vector_data/docarray_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from typing import List, Optional, Tuple

import numpy as np
from pydantic import parse_obj_as

from gptcache.manager.vector_data.base import VectorBase, VectorData
from gptcache.utils import import_docarray

import_docarray()

from docarray.typing import NdArray # pylint: disable=C0413
from docarray import BaseDoc, DocList # pylint: disable=C0413
from docarray.index import InMemoryExactNNIndex # pylint: disable=C0413


class DocarrayVectorData(BaseDoc):
    """Document schema for one indexed element: an integer ID plus its vector."""

    # Identifier of the vector data element.
    id: int
    # The vector payload; DocArrayIndex.search queries on this field.
    data: NdArray


class DocArrayIndex(VectorBase):
    """
    Vector store backed by docarray's ``InMemoryExactNNIndex``.

    Each vector is wrapped in a ``DocarrayVectorData`` document and searched on
    its ``data`` field. ``flush`` (and therefore ``close``) persists the index
    to ``index_file_path``.

    :param index_file_path: the path to docarray index, defaults to 'docarray_index.bin'.
    :type index_file_path: str
    :param top_k: the number of the vectors results to return, defaults to 1.
    :type top_k: int
    """

    def __init__(self, index_file_path: str = "docarray_index.bin", top_k: int = 1):
        # Defaults mirror the values documented in the class docstring.
        self._index = InMemoryExactNNIndex[DocarrayVectorData](
            index_file_path=index_file_path
        )
        self._index_file_path = index_file_path
        self._top_k = top_k

    def mul_add(self, datas: List[VectorData]) -> None:
        """
        Add multiple vector data elements to the index.

        :param datas: A list of vector data elements to be added.
        """
        if not datas:
            # Nothing to index; skip building an empty DocList.
            return
        docs = DocList[DocarrayVectorData](
            DocarrayVectorData(id=data.id, data=data.data) for data in datas
        )
        self._index.index(docs)

    def search(
        self, data: np.ndarray, top_k: int = -1
    ) -> Optional[List[Tuple[float, int]]]:
        """
        Search for the nearest vector data elements in the index.

        :param data: The query vector data.
        :param top_k: The number of top matches to return; ``-1`` means
            "use the ``top_k`` given at construction time".
        :return: A list of ``(score, id)`` tuples for the matched elements,
            or ``None`` when the index is empty.
        """
        if len(self._index) == 0:
            return None
        if top_k == -1:
            top_k = self._top_k
        query = parse_obj_as(NdArray, data)
        docs, scores = self._index.find(query, search_field="data", limit=top_k)
        return list(zip(scores, docs.id))

    def rebuild(self, ids: Optional[List[int]] = None) -> bool:
        """
        In the case of DocArrayIndex, the rebuild operation is not needed.

        :param ids: Unused by this implementation.
        :return: Always ``True``.
        """
        return True

    def delete(self, ids: Optional[List[int]]) -> None:
        """
        Delete the specified vector data elements from the index.

        :param ids: A list of IDs of the vector data elements to be deleted.
            ``None`` or an empty list is a no-op.
        """
        if ids is None or len(ids) == 0:
            # Avoid handing an empty selection to the underlying index.
            return
        del self._index[ids]

    def flush(self) -> None:
        """Persist the index to the configured ``index_file_path``."""
        self._index.persist(self._index_file_path)

    def close(self) -> None:
        """Close the store; for this backend that only flushes the index."""
        self.flush()
2 changes: 1 addition & 1 deletion gptcache/manager/vector_data/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Faiss(VectorBase):
:type index_path: str
:param dimension: the dimension of the vector, defaults to 0.
:type dimension: int
:param top_k: the umber of the vectors results to return, defaults to 1.
:param top_k: the number of the vectors results to return, defaults to 1.
:type top_k: int
"""

Expand Down
2 changes: 1 addition & 1 deletion gptcache/manager/vector_data/hnswlib_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Hnswlib(VectorBase):
:type index_path: str
:param dimension: the dimension of the vector, defaults to 0.
:type dimension: int
:param top_k: the umber of the vectors results to return, defaults to 1.
:param top_k: the number of the vectors results to return, defaults to 1.
:type top_k: int
:param max_elements: max_elements of hnswlib, defaults 100000.
:type max_elements: int
Expand Down
14 changes: 10 additions & 4 deletions gptcache/manager/vector_data/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
"params": {"lists": 100, "probes": 10}
}


COLLECTION_NAME = "gptcache"


Expand Down Expand Up @@ -74,7 +73,7 @@ def get(name, **kwargs):
index_params=index_params,
search_params=search_params,
local_mode=local_mode,
local_data=local_data
local_data=local_data,
)
elif name == "faiss":
from gptcache.manager.vector_data.faiss import Faiss
Expand Down Expand Up @@ -105,8 +104,10 @@ def get(name, **kwargs):
max_elements = kwargs.pop("max_elements", 100000)
VectorBase.check_dimension(dimension)
vector_base = Hnswlib(
index_file_path=index_path, dimension=dimension,
top_k=top_k, max_elements=max_elements
index_file_path=index_path,
dimension=dimension,
top_k=top_k,
max_elements=max_elements,
)
elif name == "pgvector":
from gptcache.manager.vector_data.pgvector import PGVector
Expand All @@ -121,6 +122,11 @@ def get(name, **kwargs):
collection_name=collection_name,
index_params=index_params
)
elif name == "docarray":
from gptcache.manager.vector_data.docarray_index import DocArrayIndex

index_path = kwargs.pop("index_path", "./docarray_index.bin")
vector_base = DocArrayIndex(index_file_path=index_path, top_k=top_k)
else:
raise NotFoundError("vector store", name)
return vector_base
2 changes: 1 addition & 1 deletion gptcache/manager/vector_data/milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class Milvus(VectorBase):
:type collection_name: str
:param dimension: the dimension of the vector, defaults to 0.
:type dimension: int
:param top_k: the umber of the vectors results to return, defaults to 1.
:param top_k: the number of the vectors results to return, defaults to 1.
:type top_k: int
:param index_params: the index parameters for Milvus, defaults to the HNSW index: {'metric_type': 'L2', 'index_type': 'HNSW', 'params': {'M':
8, 'efConstruction': 64}}.
Expand Down
2 changes: 1 addition & 1 deletion gptcache/manager/vector_data/pgvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class PGVector(VectorBase):
:type collection_name: str
:param dimension: the dimension of the vector, defaults to 0.
:type dimension: int
:param top_k: the umber of the vectors results to return, defaults to 1.
:param top_k: the number of the vectors results to return, defaults to 1.
:type top_k: int
:param index_params: the index parameters for pgvector, defaults to 'vector_l2_ops' index:
{"index_type": "L2", "params": {"lists": 100, "probes": 10}}.
Expand Down
7 changes: 6 additions & 1 deletion gptcache/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
"import_selective_context",
"import_httpx",
"import_openai",
]
"import_docarray",
]

import importlib.util
from typing import Optional
Expand Down Expand Up @@ -203,3 +204,7 @@ def import_httpx():

def import_openai():
    """Run ``_check_library`` for the ``openai`` package."""
    _check_library("openai")


def import_docarray():
    """Run ``_check_library`` for the ``docarray`` package."""
    _check_library("docarray")
Loading

0 comments on commit 8e11f55

Please sign in to comment.