From bdb7323571e4645bcc2c521459d5068e563f9f99 Mon Sep 17 00:00:00 2001 From: Adilkhan Sarsen <54854336+adolkhan@users.noreply.github.com> Date: Tue, 10 Oct 2023 01:36:58 +0600 Subject: [PATCH] Deep Memory virtual tensors support (#2643) * Deep Memory virtual tensors support * added test * docs fix * deepmemory docs fix * docs fix * docs fixes * docs fix * removed deeplake.core.vectorstore from docs * query_log fix * black fix * tests fix --------- Co-authored-by: adolkhan --- deeplake/core/vectorstore/deep_memory.py | 31 +++++++++---------- .../core/vectorstore/deeplake_vectorstore.py | 15 --------- deeplake/core/vectorstore/test_deepmemory.py | 27 +++++++++------- .../vector_search/indra/search_algorithm.py | 6 ++-- docs/source/deeplake.VectorStore.rst | 6 ++++ docs/source/index.rst | 2 +- 6 files changed, 41 insertions(+), 46 deletions(-) create mode 100644 docs/source/deeplake.VectorStore.rst diff --git a/deeplake/core/vectorstore/deep_memory.py b/deeplake/core/vectorstore/deep_memory.py index 4034ca35ea..e23a411b85 100644 --- a/deeplake/core/vectorstore/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory.py @@ -53,14 +53,6 @@ def __init__( self.embedding_function = embedding_function self.client = client self.creds = creds or {} - self.queries_dataset = deeplake.dataset( - self.dataset.path + "_eval_queries", - token=token, - read_only=False, - creds=self.creds, - ) - if len(self.queries_dataset) == 0: - self.queries_dataset.commit(allow_empty=True) def train( self, @@ -117,6 +109,7 @@ def train( runtime = None if get_path_type(corpus_path) == "hub": runtime = {"tensor_db": True} + queries_vs = VectorStore( path=queries_path, overwrite=True, @@ -192,7 +185,6 @@ def status(self, job_id: str): Examples: >>> vectorstore.deep_memory.status(job_id) - -------------------------------------------------------------- | 6508464cd80cab681bfcfff3 | -------------------------------------------------------------- @@ -281,7 +273,7 @@ def evaluate( """Evaluate a model on DeepMemory managed service. Examples: - # Evaluate a model with embedding function + >>> #1. Evaluate a model with embedding function >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]] >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset. >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"] @@ -291,8 +283,7 @@ def evaluate( ... queries=queries, ... embedding_function=embedding_function, ... ) - - # Evaluate a model with precomputed embeddings + >>> #2. Evaluate a model with precomputed embeddings >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]] >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset. >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"] @@ -302,8 +293,7 @@ def evaluate( ... queries=queries, ... embedding=embedding, ... ) - - # Evaluate a model with precomputed embeddings and log queries + >>> #3. Evaluate a model with precomputed embeddings and log queries >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]] >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset. >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"] @@ -316,8 +306,7 @@ def evaluate( ... "log_queries": True, ... } ... ) - - # Evaluate a model with precomputed embeddings and log queries, and custom branch + >>> #4. Evaluate a model with precomputed embeddings and log queries, and custom branch >>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]] >>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that is relevant to the queries. It is stored in the `id` tensor of the corpus dataset. >>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"] @@ -442,6 +431,16 @@ def evaluate( if log_queries == False: return recalls + self.queries_dataset = deeplake.empty( + self.dataset.path + "_eval_queries", + token=self.token, + creds=self.creds, + overwrite=True, + ) + + if len(self.queries_dataset) == 0: + self.queries_dataset.commit(allow_empty=True) + create = branch not in self.queries_dataset.branches self.queries_dataset.checkout(parsed_qvs_params["branch"], create=create) diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index 617f5d47cd..864d62bcb3 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -64,13 +64,11 @@ def __init__( >>> data = VectorStore( ... path = "./my_vector_store", ... ) - >>> # Create a vector store in the Deep Lake Managed Tensor Database >>> data = VectorStore( ... path = "hub://org_id/dataset_name", ... runtime = {"tensor_db": True}, ... ) - >>> # Create a vector store with custom tensors >>> data = VectorStore( ... path = "./my_vector_store", @@ -233,14 +231,12 @@ def add( >>> metadatas = [{"timestamp": "01:20"}, {"timestamp": "01:22"}] >>> emebdding_fn = lambda x: [[1, 2, 3]] * len(x) >>> embedding_fn_2 = lambda x: [[4, 5]] * len(x) - >>> # Directly upload embeddings >>> deeplake_vector_store.add( ... text = texts, ... embedding = embeddings, ... metadata = metadatas, ... ) - >>> # Upload embedding via embedding function >>> deeplake_vector_store.add( ... text = texts, @@ -248,7 +244,6 @@ def add( ... embedding_function = embedding_fn, ... embedding_data = texts, ... ) - >>> # Upload embedding via embedding function to a user-defined embedding tensor >>> deeplake_vector_store.add( ... text = texts, @@ -257,14 +252,12 @@ def add( ... embedding_data = texts, ... embedding_tensor = "embedding_1", ... ) - >>> # Multiple embedding functions (user defined embedding tensors must be specified) >>> deeplake_vector_store.add( ... embedding_tensor = ["embedding_1", "embedding_2"] ... embedding_function = [embedding_fn, embedding_fn_2], ... embedding_data = [texts, texts], ... ) - >>> # Alternative syntax for multiple embedding functions >>> deeplake_vector_store.add( ... text = texts, @@ -272,7 +265,6 @@ def add( ... embedding_tensor_1 = (embedding_fn, texts), ... embedding_tensor_2 = (embedding_fn_2, texts), ... ) - >>> # Add data to fully custom tensors >>> deeplake_vector_store.add( ... tensor_A = [1, 2], @@ -396,21 +388,18 @@ def search( ... embedding = [1, 2, 3], ... exec_option = "python", ... ) - >>> # Search using an embedding function and data for embedding >>> data = vector_store.search( ... embedding_data = "What does this chatbot do?", ... embedding_function = query_embedding_fn, ... exec_option = "compute_engine", ... ) - >>> # Add a filter to your search >>> data = vector_store.search( ... embedding = np.ones(3), ... exec_option = "python", ... filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"},...}, # Only valid for exec_option = "python" ... ) - >>> # Search using TQL >>> data = vector_store.search( ... query = "select * where ..... ", @@ -553,12 +542,10 @@ def delete( Examples: >>> # Delete using ids: >>> data = vector_store.delete(ids) - >>> # Delete data using filter >>> data = vector_store.delete( ... filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"}}, ... ) - >>> # Delete data using TQL >>> data = vector_store.delete( ... query = "select * where ..... ", @@ -649,7 +636,6 @@ def update_embedding( ... embedding_tensor = "embedding", ... embedding_function = embedding_function, ... ) - >>> # Update data using filter and several embedding_tensors, several embedding_source_tensors >>> # and several embedding_functions: >>> data = vector_store.update( @@ -658,7 +644,6 @@ def update_embedding( ... filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"}}, ... embedding_tensor = ["text_embedding", "metadata_embedding"] ... ) - >>> # Update data using TQL, if new embedding function is not specified the embedding_function used >>> # during initialization will be used >>> data = vector_store.update( diff --git a/deeplake/core/vectorstore/test_deepmemory.py b/deeplake/core/vectorstore/test_deepmemory.py index 6f09d75a72..4c30236c51 100644 --- a/deeplake/core/vectorstore/test_deepmemory.py +++ b/deeplake/core/vectorstore/test_deepmemory.py @@ -311,12 +311,13 @@ def test_deepmemory_evaluate_without_logging( }, ) sleep(15) - with pytest.raises(ValueError): - queries_dataset = VectorStore( - path=query_path, - token=hub_cloud_dev_token, - read_only=True, - ) + + queries_dataset = VectorStore( + path=query_path, + token=hub_cloud_dev_token, + read_only=True, + ) + assert len(queries_dataset) == 0 @pytest.mark.slow @@ -386,12 +387,13 @@ def test_deepmemory_evaluate_without_qvs_params( ) sleep(15) - with pytest.raises(ValueError): - queries_dataset = VectorStore( - path=query_path, - token=hub_cloud_dev_token, - read_only=True, - ) + + queries_dataset = VectorStore( + path=query_path, + token=hub_cloud_dev_token, + read_only=True, + ) + assert len(queries_dataset) == 0 @pytest.mark.slow @@ -565,3 +567,4 @@ def test_deepmemory_search_on_local_datasets( output = corpus.search(embedding=query_embedding, deep_memory=True, k=10) assert correct_id in output["id"] + assert "score" in output diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index ae6c87917e..d1228e6906 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -4,6 +4,7 @@ from deeplake.core.vectorstore.vector_search.indra import query from deeplake.core.vectorstore.vector_search import utils from deeplake.core.dataset import Dataset as DeepLakeDataset +from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset from deeplake.enterprise.util import raise_indra_installation_error @@ -83,8 +84,9 @@ def search( api.tql.prepare_deepmemory_metrics(indra_dataset) indra_view = indra_dataset.query(tql_query) - indexes = indra_view.indexes - view = deeplake_dataset[indexes] + + view = DeepLakeQueryDataset(deeplake_ds=deeplake_dataset, indra_ds=indra_view) + view._tql_query = tql_query return_data = {} diff --git a/docs/source/deeplake.VectorStore.rst b/docs/source/deeplake.VectorStore.rst new file mode 100644 index 0000000000..e12aa2fb65 --- /dev/null +++ b/docs/source/deeplake.VectorStore.rst @@ -0,0 +1,6 @@ +deeplake.VectorStore +-------------------- + +.. autoclass:: deeplake.core.vectorstore.deeplake_vectorstore.VectorStore + :members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 253475641e..f9820aa89a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -50,6 +50,7 @@ Deep Lake is an open-source database for AI. :caption: API Reference deeplake + deeplake.VectorStore deeplake.core deeplake.core.dataset deeplake.core.tensor @@ -58,7 +59,6 @@ Deep Lake is an open-source database for AI. deeplake.util deeplake.client.log deeplake.core.transform - deeplake.core.vectorstore deeplake.core.vectorstore.deep_memory deeplake.random