Deep Memory virtual tensors support (activeloopai#2643)

* Deep Memory virtual tensors support
* added test
* docs fix
* deepmemory docs fix
* docs fix
* docs fixes
* docs fix
* removed deeplake.core.vectorstore from docs
* query_log fix
* black fix
* tests fix

---------

Co-authored-by: adolkhan <adilkhan.sarsen@alumni.nu.edu.kz>
puyuanOT · Oct 9, 2023 · bdb7323 · bdb7323
1 parent 4e010d9
commit bdb7323
Show file tree

Hide file tree

Showing 6 changed files with 41 additions and 46 deletions.
diff --git a/deeplake/core/vectorstore/deep_memory.py b/deeplake/core/vectorstore/deep_memory.py
@@ -53,14 +53,6 @@ def __init__(
         self.embedding_function = embedding_function
         self.client = client
         self.creds = creds or {}
-        self.queries_dataset = deeplake.dataset(
-            self.dataset.path + "_eval_queries",
-            token=token,
-            read_only=False,
-            creds=self.creds,
-        )
-        if len(self.queries_dataset) == 0:
-            self.queries_dataset.commit(allow_empty=True)
 
     def train(
         self,
@@ -117,6 +109,7 @@ def train(
         runtime = None
         if get_path_type(corpus_path) == "hub":
             runtime = {"tensor_db": True}
+
         queries_vs = VectorStore(
             path=queries_path,
             overwrite=True,
@@ -192,7 +185,6 @@ def status(self, job_id: str):
 
         Examples:
             >>> vectorstore.deep_memory.status(job_id)
-
             --------------------------------------------------------------
             |                  6508464cd80cab681bfcfff3                  |
             --------------------------------------------------------------
@@ -281,7 +273,7 @@ def evaluate(
         """Evaluate a model on DeepMemory managed service.
 
         Examples:
-            # Evaluate a model with embedding function
+            >>> #1. Evaluate a model with embedding function
             >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
             >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
             >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
@@ -291,8 +283,7 @@ def evaluate(
             ...     queries=queries,
             ...     embedding_function=embedding_function,
             ... )
-
-            # Evaluate a model with precomputed embeddings
+            >>> #2. Evaluate a model with precomputed embeddings
             >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
             >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
             >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
@@ -302,8 +293,7 @@ def evaluate(
             ...     queries=queries,
             ...     embedding=embedding,
             ... )
-
-            # Evaluate a model with precomputed embeddings and log queries
+            >>> #3. Evaluate a model with precomputed embeddings and log queries
             >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
             >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
             >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
@@ -316,8 +306,7 @@ def evaluate(
             ...         "log_queries": True,
             ...     }
             ... )
-
-            # Evaluate a model with precomputed embeddings and log queries, and custom branch
+            >>> #4. Evaluate a model with precomputed embeddings and log queries, and custom branch
             >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
             >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
             >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
@@ -442,6 +431,16 @@ def evaluate(
         if log_queries == False:
             return recalls
 
+        self.queries_dataset = deeplake.empty(
+            self.dataset.path + "_eval_queries",
+            token=self.token,
+            creds=self.creds,
+            overwrite=True,
+        )
+
+        if len(self.queries_dataset) == 0:
+            self.queries_dataset.commit(allow_empty=True)
+
         create = branch not in self.queries_dataset.branches
         self.queries_dataset.checkout(parsed_qvs_params["branch"], create=create)
 

diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -64,13 +64,11 @@ def __init__(
             >>> data = VectorStore(
             ...        path = "./my_vector_store",
             ... )
-
             >>> # Create a vector store in the Deep Lake Managed Tensor Database
             >>> data = VectorStore(
             ...        path = "hub://org_id/dataset_name",
             ...        runtime = {"tensor_db": True},
             ... )
-
             >>> # Create a vector store with custom tensors
             >>> data = VectorStore(
             ...        path = "./my_vector_store",
@@ -233,22 +231,19 @@ def add(
             >>> metadatas = [{"timestamp": "01:20"}, {"timestamp": "01:22"}]
             >>> embedding_fn = lambda x: [[1, 2, 3]] * len(x)
             >>> embedding_fn_2 = lambda x: [[4, 5]] * len(x)
-
             >>> # Directly upload embeddings
             >>> deeplake_vector_store.add(
             ...     text = texts,
             ...     embedding = embeddings,
             ...     metadata = metadatas,
             ... )
-
             >>> # Upload embedding via embedding function
             >>> deeplake_vector_store.add(
             ...     text = texts,
             ...     metadata = metadatas,
             ...     embedding_function = embedding_fn,
             ...     embedding_data = texts,
             ... )
-
             >>> # Upload embedding via embedding function to a user-defined embedding tensor
             >>> deeplake_vector_store.add(
             ...     text = texts,
@@ -257,22 +252,19 @@ def add(
             ...     embedding_data = texts,
             ...     embedding_tensor = "embedding_1",
             ... )
-
             >>> # Multiple embedding functions (user defined embedding tensors must be specified)
             >>> deeplake_vector_store.add(
             ...     embedding_tensor = ["embedding_1", "embedding_2"],
             ...     embedding_function = [embedding_fn, embedding_fn_2],
             ...     embedding_data = [texts, texts],
             ... )
-
             >>> # Alternative syntax for multiple embedding functions
             >>> deeplake_vector_store.add(
             ...     text = texts,
             ...     metadata = metadatas,
             ...     embedding_tensor_1 = (embedding_fn, texts),
             ...     embedding_tensor_2 = (embedding_fn_2, texts),
             ... )
-
             >>> # Add data to fully custom tensors
             >>> deeplake_vector_store.add(
             ...     tensor_A = [1, 2],
@@ -396,21 +388,18 @@ def search(
             ...        embedding = [1, 2, 3],
             ...        exec_option = "python",
             ... )
-
             >>> # Search using an embedding function and data for embedding
             >>> data = vector_store.search(
             ...        embedding_data = "What does this chatbot do?",
             ...        embedding_function = query_embedding_fn,
             ...        exec_option = "compute_engine",
             ... )
-
             >>> # Add a filter to your search
             >>> data = vector_store.search(
             ...        embedding = np.ones(3),
             ...        exec_option = "python",
             ...        filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"},...}, # Only valid for exec_option = "python"
             ... )
-
             >>> # Search using TQL
             >>> data = vector_store.search(
             ...        query = "select * where ..... <add TQL syntax>",
@@ -553,12 +542,10 @@ def delete(
         Examples:
             >>> # Delete using ids:
             >>> data = vector_store.delete(ids)
-
             >>> # Delete data using filter
             >>> data = vector_store.delete(
             ...        filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"}},
             ... )
-
             >>> # Delete data using TQL
             >>> data = vector_store.delete(
             ...        query = "select * where ..... <add TQL syntax>",
@@ -649,7 +636,6 @@ def update_embedding(
             ...    embedding_tensor = "embedding",
             ...    embedding_function = embedding_function,
             ... )
-
             >>> # Update data using filter and several embedding_tensors, several embedding_source_tensors
             >>> # and several embedding_functions:
             >>> data = vector_store.update(
@@ -658,7 +644,6 @@ def update_embedding(
             ...     filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"}},
             ...     embedding_tensor = ["text_embedding", "metadata_embedding"]
             ... )
-
             >>> # Update data using TQL, if new embedding function is not specified the embedding_function used
             >>> # during initialization will be used
             >>> data = vector_store.update(

diff --git a/deeplake/core/vectorstore/test_deepmemory.py b/deeplake/core/vectorstore/test_deepmemory.py
@@ -311,12 +311,13 @@ def test_deepmemory_evaluate_without_logging(
         },
     )
     sleep(15)
-    with pytest.raises(ValueError):
-        queries_dataset = VectorStore(
-            path=query_path,
-            token=hub_cloud_dev_token,
-            read_only=True,
-        )
+
+    queries_dataset = VectorStore(
+        path=query_path,
+        token=hub_cloud_dev_token,
+        read_only=True,
+    )
+    assert len(queries_dataset) == 0
 
 
 @pytest.mark.slow
@@ -386,12 +387,13 @@ def test_deepmemory_evaluate_without_qvs_params(
     )
 
     sleep(15)
-    with pytest.raises(ValueError):
-        queries_dataset = VectorStore(
-            path=query_path,
-            token=hub_cloud_dev_token,
-            read_only=True,
-        )
+
+    queries_dataset = VectorStore(
+        path=query_path,
+        token=hub_cloud_dev_token,
+        read_only=True,
+    )
+    assert len(queries_dataset) == 0
 
 
 @pytest.mark.slow
@@ -565,3 +567,4 @@ def test_deepmemory_search_on_local_datasets(
     output = corpus.search(embedding=query_embedding, deep_memory=True, k=10)
 
     assert correct_id in output["id"]
+    assert "score" in output
diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py
@@ -4,6 +4,7 @@
 from deeplake.core.vectorstore.vector_search.indra import query
 from deeplake.core.vectorstore.vector_search import utils
 from deeplake.core.dataset import Dataset as DeepLakeDataset
+from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset
 from deeplake.enterprise.util import raise_indra_installation_error
 
 
@@ -83,8 +84,9 @@ def search(
         api.tql.prepare_deepmemory_metrics(indra_dataset)
 
         indra_view = indra_dataset.query(tql_query)
-        indexes = indra_view.indexes
-        view = deeplake_dataset[indexes]
+
+        view = DeepLakeQueryDataset(deeplake_ds=deeplake_dataset, indra_ds=indra_view)
+        view._tql_query = tql_query
 
         return_data = {}
 

diff --git a/docs/source/deeplake.VectorStore.rst b/docs/source/deeplake.VectorStore.rst
@@ -0,0 +1,6 @@
+deeplake.VectorStore
+--------------------
+
+.. autoclass:: deeplake.core.vectorstore.deeplake_vectorstore.VectorStore
+    :members:
+    :show-inheritance:
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -50,6 +50,7 @@ Deep Lake is an open-source database for AI.
    :caption: API Reference
 
    deeplake
+   deeplake.VectorStore
    deeplake.core
    deeplake.core.dataset
    deeplake.core.tensor
@@ -58,7 +59,6 @@ Deep Lake is an open-source database for AI.
    deeplake.util
    deeplake.client.log
    deeplake.core.transform
-   deeplake.core.vectorstore
    deeplake.core.vectorstore.deep_memory
    deeplake.random