DocArray as a vectorstore (#351)

* feat: docarray vectorstore Signed-off-by: jupyterjazz <saba.sturua@jina.ai> * test: fix issues Signed-off-by: jupyterjazz <saba.sturua@jina.ai> * refactor: add pgvector back Signed-off-by: jupyterjazz <saba.sturua@jina.ai> --------- Signed-off-by: jupyterjazz <saba.sturua@jina.ai>
zilliztech · May 16, 2023 · 8e11f55 · 8e11f55
1 parent 543e329
commit 8e11f55
Show file tree

Hide file tree

Showing 11 changed files with 193 additions and 67 deletions.
diff --git a/examples/data_manager/vector_store.py b/examples/data_manager/vector_store.py
@@ -1,9 +1,9 @@
-from gptcache.adapter import openai
-from gptcache import cache
-from gptcache.manager import get_data_manager, CacheBase, VectorBase
-from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
 import numpy as np
 
+from gptcache import cache
+from gptcache.adapter import openai
+from gptcache.manager import CacheBase, VectorBase, get_data_manager
+from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
 
 d = 8
 
@@ -17,26 +17,26 @@ def run():
         'faiss',
         'milvus',
         'chromadb',
+        'docarray',
     ]
     for vector_store in vector_stores:
         cache_base = CacheBase('sqlite')
         vector_base = VectorBase(vector_store, dimension=d)
         data_manager = get_data_manager(cache_base, vector_base)
 
-        cache.init(embedding_func=mock_embeddings,
-                   data_manager=data_manager,
-                   similarity_evaluation=SearchDistanceEvaluation(),
-                   )
+        cache.init(
+            embedding_func=mock_embeddings,
+            data_manager=data_manager,
+            similarity_evaluation=SearchDistanceEvaluation(),
+        )
         cache.set_openai_key()
 
         answer = openai.ChatCompletion.create(
             model='gpt-3.5-turbo',
-            messages=[
-                {'role': 'user', 'content': 'what is chatgpt'}
-            ],
+            messages=[{'role': 'user', 'content': 'what is chatgpt'}],
         )
         print(answer)
 
 
 if __name__ == '__main__':
-    run()
+    run()
diff --git a/gptcache/manager/vector_data/__init__.py b/gptcache/manager/vector_data/__init__.py
@@ -18,7 +18,7 @@ def VectorBase(name: str, **kwargs):
     :param name: the name of the vectorbase, it is support 'milvus', 'faiss', 'chromadb', 'hnswlib' now.
     :type name: str
 
-    :param top_k: the umber of the vectors results to return, defaults to 1.
+    :param top_k: the number of the vectors results to return, defaults to 1.
     :type top_k: int
 
     :param dimension: the dimension of the vector, defaults to 0.

diff --git a/gptcache/manager/vector_data/chroma.py b/gptcache/manager/vector_data/chroma.py
@@ -18,7 +18,7 @@ class Chromadb(VectorBase):
     :type persist_directory: str
     :param collection_name: the name of the collection in Chromadb, defaults to 'gptcache'.
     :type collection_name: str
-    :param top_k: the umber of the vectors results to return, defaults to 1.
+    :param top_k: the number of the vectors results to return, defaults to 1.
     :type top_k: int
 
     """

diff --git a/gptcache/manager/vector_data/docarray_index.py b/gptcache/manager/vector_data/docarray_index.py
@@ -0,0 +1,90 @@
+from typing import List, Optional, Tuple
+
+import numpy as np
+from pydantic import parse_obj_as
+
+from gptcache.manager.vector_data.base import VectorBase, VectorData
+from gptcache.utils import import_docarray
+
+import_docarray()
+
+from docarray.typing import NdArray  # pylint: disable=C0413
+from docarray import BaseDoc, DocList  # pylint: disable=C0413
+from docarray.index import InMemoryExactNNIndex  # pylint: disable=C0413
+
+
+class DocarrayVectorData(BaseDoc):
+    """Class representing a vector data element with an ID and associated data."""
+
+    id: int  # integer key; DocArrayIndex.search returns it and delete() removes by it
+    data: NdArray  # the vector; DocArrayIndex.search queries this field ("data")
+
+
+class DocArrayIndex(VectorBase):
+    """
+    Vector store backed by docarray's ``InMemoryExactNNIndex``.
+
+    :param index_file_path: path of the docarray index file; the index is created
+        with it and ``flush`` persists to it ('./docarray_index.bin' via the manager).
+    :type index_file_path: str
+    :param top_k: the number of vector results ``search`` returns by default.
+    :type top_k: int
+    """
+
+    def __init__(self, index_file_path: str, top_k: int):
+        self._index = InMemoryExactNNIndex[DocarrayVectorData](
+            index_file_path=index_file_path
+        )
+        self._index_file_path = index_file_path
+        self._top_k = top_k
+
+    def mul_add(self, datas: List[VectorData]) -> None:
+        """
+        Add multiple vector data elements to the index.
+
+        :param datas: A list of vector data elements (``id`` and ``data``) to be added.
+        """
+        docs = DocList[DocarrayVectorData](
+            DocarrayVectorData(id=data.id, data=data.data) for data in datas
+        )
+        self._index.index(docs)
+
+    def search(
+        self, data: np.ndarray, top_k: int = -1
+    ) -> Optional[List[Tuple[float, int]]]:
+        """
+        Search for the nearest vector data elements in the index.
+
+        :param data: The query vector data.
+        :param top_k: The number of matches to return; -1 uses the instance default.
+        :return: A list of (score, id) tuples for the matches, or None when
+            the index is empty.
+        """
+        if len(self._index) == 0:
+            return None
+        if top_k == -1:
+            top_k = self._top_k
+        query = parse_obj_as(NdArray, data)
+        docs, scores = self._index.find(query, search_field="data", limit=top_k)
+        return list(zip(scores, docs.id))
+
+    def rebuild(self, ids: Optional[List[int]] = None) -> bool:
+        """No-op for DocArrayIndex (no rebuild is needed); always returns True."""
+        return True
+
+    def delete(self, ids: Optional[List[int]]) -> None:
+        """
+        Delete the specified vector data elements from the index.
+
+        :param ids: Integer IDs of the elements to delete; None is a no-op.
+        """
+        if ids is not None:
+            del self._index[ids]
+
+    def flush(self) -> None:
+        """Persist the index to ``index_file_path``."""
+        self._index.persist(self._index_file_path)
+
+    def close(self) -> None:
+        """Flush the index to disk; nothing else is released here."""
+        self.flush()
diff --git a/gptcache/manager/vector_data/faiss.py b/gptcache/manager/vector_data/faiss.py
@@ -18,7 +18,7 @@ class Faiss(VectorBase):
     :type index_path: str
     :param dimension: the dimension of the vector, defaults to 0.
     :type dimension: int
-    :param top_k: the umber of the vectors results to return, defaults to 1.
+    :param top_k: the number of the vectors results to return, defaults to 1.
     :type top_k: int
     """
 

diff --git a/gptcache/manager/vector_data/hnswlib_store.py b/gptcache/manager/vector_data/hnswlib_store.py
@@ -18,7 +18,7 @@ class Hnswlib(VectorBase):
     :type index_path: str
     :param dimension: the dimension of the vector, defaults to 0.
     :type dimension: int
-    :param top_k: the umber of the vectors results to return, defaults to 1.
+    :param top_k: the number of the vectors results to return, defaults to 1.
     :type top_k: int
     :param max_elements: max_elements of hnswlib, defaults 100000.
     :type max_elements: int

diff --git a/gptcache/manager/vector_data/manager.py b/gptcache/manager/vector_data/manager.py
@@ -22,7 +22,6 @@
     "params": {"lists": 100, "probes": 10}
 }
 
-
 COLLECTION_NAME = "gptcache"
 
 
@@ -74,7 +73,7 @@ def get(name, **kwargs):
                 index_params=index_params,
                 search_params=search_params,
                 local_mode=local_mode,
-                local_data=local_data
+                local_data=local_data,
             )
         elif name == "faiss":
             from gptcache.manager.vector_data.faiss import Faiss
@@ -105,8 +104,10 @@ def get(name, **kwargs):
             max_elements = kwargs.pop("max_elements", 100000)
             VectorBase.check_dimension(dimension)
             vector_base = Hnswlib(
-                index_file_path=index_path, dimension=dimension,
-                top_k=top_k, max_elements=max_elements
+                index_file_path=index_path,
+                dimension=dimension,
+                top_k=top_k,
+                max_elements=max_elements,
             )
         elif name == "pgvector":
             from gptcache.manager.vector_data.pgvector import PGVector
@@ -121,6 +122,11 @@ def get(name, **kwargs):
                 collection_name=collection_name,
                 index_params=index_params
             )
+        elif name == "docarray":
+            from gptcache.manager.vector_data.docarray_index import DocArrayIndex
+
+            index_path = kwargs.pop("index_path", "./docarray_index.bin")
+            vector_base = DocArrayIndex(index_file_path=index_path, top_k=top_k)
         else:
             raise NotFoundError("vector store", name)
         return vector_base
diff --git a/gptcache/manager/vector_data/milvus.py b/gptcache/manager/vector_data/milvus.py
@@ -37,7 +37,7 @@ class Milvus(VectorBase):
     :type collection_name: str
     :param dimension: the dimension of the vector, defaults to 0.
     :type dimension: int
-    :param top_k: the umber of the vectors results to return, defaults to 1.
+    :param top_k: the number of the vectors results to return, defaults to 1.
     :type top_k: int
     :param index_params: the index parameters for Milvus, defaults to the HNSW index: {'metric_type': 'L2', 'index_type': 'HNSW', 'params': {'M':
                          8, 'efConstruction': 64}}.

diff --git a/gptcache/manager/vector_data/pgvector.py b/gptcache/manager/vector_data/pgvector.py
@@ -69,7 +69,7 @@ class PGVector(VectorBase):
     :type collection_name: str
     :param dimension: the dimension of the vector, defaults to 0.
     :type dimension: int
-    :param top_k: the umber of the vectors results to return, defaults to 1.
+    :param top_k: the number of the vectors results to return, defaults to 1.
     :type top_k: int
     :param index_params: the index parameters for pgvector, defaults to 'vector_l2_ops' index:
                          {"index_type": "L2", "params": {"lists": 100, "probes": 10}.

diff --git a/gptcache/utils/__init__.py b/gptcache/utils/__init__.py
@@ -30,7 +30,8 @@
     "import_selective_context",
     "import_httpx",
     "import_openai",
-    ]
+    "import_docarray",
+]
 
 import importlib.util
 from typing import Optional
@@ -203,3 +204,7 @@ def import_httpx():
 
 def import_openai():
     _check_library("openai")
+
+
+def import_docarray():
+    _check_library("docarray")  # NOTE(review): presumably checks docarray is installed — confirm