Skip to content

Commit

Permalink
Deep Memory virtual tensors support (activeloopai#2643)
Browse files Browse the repository at this point in the history
* Deep Memory virtual tensors support

* added test

* docs fix

* deepmemory docs fix

* docs fix

* docs fixes

* docs fix

* removed deeplake.core.vectorstore from docs

* query_log fix

* black fix

* tests fix

---------

Co-authored-by: adolkhan <adilkhan.sarsen@alumni.nu.edu.kz>
  • Loading branch information
adolkhan and adolkhan authored Oct 9, 2023
1 parent 4e010d9 commit bdb7323
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 46 deletions.
31 changes: 15 additions & 16 deletions deeplake/core/vectorstore/deep_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,6 @@ def __init__(
self.embedding_function = embedding_function
self.client = client
self.creds = creds or {}
self.queries_dataset = deeplake.dataset(
self.dataset.path + "_eval_queries",
token=token,
read_only=False,
creds=self.creds,
)
if len(self.queries_dataset) == 0:
self.queries_dataset.commit(allow_empty=True)

def train(
self,
Expand Down Expand Up @@ -117,6 +109,7 @@ def train(
runtime = None
if get_path_type(corpus_path) == "hub":
runtime = {"tensor_db": True}

queries_vs = VectorStore(
path=queries_path,
overwrite=True,
Expand Down Expand Up @@ -192,7 +185,6 @@ def status(self, job_id: str):
Examples:
>>> vectorstore.deep_memory.status(job_id)
--------------------------------------------------------------
| 6508464cd80cab681bfcfff3 |
--------------------------------------------------------------
Expand Down Expand Up @@ -281,7 +273,7 @@ def evaluate(
"""Evaluate a model on DeepMemory managed service.
Examples:
# Evaluate a model with embedding function
>>> #1. Evaluate a model with embedding function
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand All @@ -291,8 +283,7 @@ def evaluate(
... queries=queries,
... embedding_function=embedding_function,
... )
# Evaluate a model with precomputed embeddings
>>> #2. Evaluate a model with precomputed embeddings
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand All @@ -302,8 +293,7 @@ def evaluate(
... queries=queries,
... embedding=embedding,
... )
# Evaluate a model with precomputed embeddings and log queries
>>> #3. Evaluate a model with precomputed embeddings and log queries
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand All @@ -316,8 +306,7 @@ def evaluate(
... "log_queries": True,
... }
... )
# Evaluate a model with precomputed embeddings and log queries, and custom branch
>>> #4. Evaluate a model with precomputed embeddings and log queries, and custom branch
>>> relevance: List[List[Tuple[str, int]]] = [[("doc_id_1", 1), ("doc_id_2", 1)], [("doc_id_3", 1)]]
>>> # doc_id_1, doc_id_2, doc_id_3 are the ids of the documents in the corpus dataset that are relevant to the queries. They are stored in the `id` tensor of the corpus dataset.
>>> queries: List[str] = ["What is the capital of India?", "What is the capital of France?"]
Expand Down Expand Up @@ -442,6 +431,16 @@ def evaluate(
if log_queries == False:
return recalls

self.queries_dataset = deeplake.empty(
self.dataset.path + "_eval_queries",
token=self.token,
creds=self.creds,
overwrite=True,
)

if len(self.queries_dataset) == 0:
self.queries_dataset.commit(allow_empty=True)

create = branch not in self.queries_dataset.branches
self.queries_dataset.checkout(parsed_qvs_params["branch"], create=create)

Expand Down
15 changes: 0 additions & 15 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,11 @@ def __init__(
>>> data = VectorStore(
... path = "./my_vector_store",
... )
>>> # Create a vector store in the Deep Lake Managed Tensor Database
>>> data = VectorStore(
... path = "hub://org_id/dataset_name",
... runtime = {"tensor_db": True},
... )
>>> # Create a vector store with custom tensors
>>> data = VectorStore(
... path = "./my_vector_store",
Expand Down Expand Up @@ -233,22 +231,19 @@ def add(
>>> metadatas = [{"timestamp": "01:20"}, {"timestamp": "01:22"}]
>>> embedding_fn = lambda x: [[1, 2, 3]] * len(x)
>>> embedding_fn_2 = lambda x: [[4, 5]] * len(x)
>>> # Directly upload embeddings
>>> deeplake_vector_store.add(
... text = texts,
... embedding = embeddings,
... metadata = metadatas,
... )
>>> # Upload embedding via embedding function
>>> deeplake_vector_store.add(
... text = texts,
... metadata = metadatas,
... embedding_function = embedding_fn,
... embedding_data = texts,
... )
>>> # Upload embedding via embedding function to a user-defined embedding tensor
>>> deeplake_vector_store.add(
... text = texts,
Expand All @@ -257,22 +252,19 @@ def add(
... embedding_data = texts,
... embedding_tensor = "embedding_1",
... )
>>> # Multiple embedding functions (user defined embedding tensors must be specified)
>>> deeplake_vector_store.add(
... embedding_tensor = ["embedding_1", "embedding_2"],
... embedding_function = [embedding_fn, embedding_fn_2],
... embedding_data = [texts, texts],
... )
>>> # Alternative syntax for multiple embedding functions
>>> deeplake_vector_store.add(
... text = texts,
... metadata = metadatas,
... embedding_tensor_1 = (embedding_fn, texts),
... embedding_tensor_2 = (embedding_fn_2, texts),
... )
>>> # Add data to fully custom tensors
>>> deeplake_vector_store.add(
... tensor_A = [1, 2],
Expand Down Expand Up @@ -396,21 +388,18 @@ def search(
... embedding = [1, 2, 3],
... exec_option = "python",
... )
>>> # Search using an embedding function and data for embedding
>>> data = vector_store.search(
... embedding_data = "What does this chatbot do?",
... embedding_function = query_embedding_fn,
... exec_option = "compute_engine",
... )
>>> # Add a filter to your search
>>> data = vector_store.search(
... embedding = np.ones(3),
... exec_option = "python",
... filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"},...}, # Only valid for exec_option = "python"
... )
>>> # Search using TQL
>>> data = vector_store.search(
... query = "select * where ..... <add TQL syntax>",
Expand Down Expand Up @@ -553,12 +542,10 @@ def delete(
Examples:
>>> # Delete using ids:
>>> data = vector_store.delete(ids)
>>> # Delete data using filter
>>> data = vector_store.delete(
... filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"}},
... )
>>> # Delete data using TQL
>>> data = vector_store.delete(
... query = "select * where ..... <add TQL syntax>",
Expand Down Expand Up @@ -649,7 +636,6 @@ def update_embedding(
... embedding_tensor = "embedding",
... embedding_function = embedding_function,
... )
>>> # Update data using filter and several embedding_tensors, several embedding_source_tensors
>>> # and several embedding_functions:
>>> data = vector_store.update(
Expand All @@ -658,7 +644,6 @@ def update_embedding(
... filter = {"json_tensor_name": {"key: value"}, "json_tensor_name_2": {"key_2: value_2"}},
... embedding_tensor = ["text_embedding", "metadata_embedding"]
... )
>>> # Update data using TQL, if new embedding function is not specified the embedding_function used
>>> # during initialization will be used
>>> data = vector_store.update(
Expand Down
27 changes: 15 additions & 12 deletions deeplake/core/vectorstore/test_deepmemory.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,12 +311,13 @@ def test_deepmemory_evaluate_without_logging(
},
)
sleep(15)
with pytest.raises(ValueError):
queries_dataset = VectorStore(
path=query_path,
token=hub_cloud_dev_token,
read_only=True,
)

queries_dataset = VectorStore(
path=query_path,
token=hub_cloud_dev_token,
read_only=True,
)
assert len(queries_dataset) == 0


@pytest.mark.slow
Expand Down Expand Up @@ -386,12 +387,13 @@ def test_deepmemory_evaluate_without_qvs_params(
)

sleep(15)
with pytest.raises(ValueError):
queries_dataset = VectorStore(
path=query_path,
token=hub_cloud_dev_token,
read_only=True,
)

queries_dataset = VectorStore(
path=query_path,
token=hub_cloud_dev_token,
read_only=True,
)
assert len(queries_dataset) == 0


@pytest.mark.slow
Expand Down Expand Up @@ -565,3 +567,4 @@ def test_deepmemory_search_on_local_datasets(
output = corpus.search(embedding=query_embedding, deep_memory=True, k=10)

assert correct_id in output["id"]
assert "score" in output
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from deeplake.core.vectorstore.vector_search.indra import query
from deeplake.core.vectorstore.vector_search import utils
from deeplake.core.dataset import Dataset as DeepLakeDataset
from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset
from deeplake.enterprise.util import raise_indra_installation_error


Expand Down Expand Up @@ -83,8 +84,9 @@ def search(
api.tql.prepare_deepmemory_metrics(indra_dataset)

indra_view = indra_dataset.query(tql_query)
indexes = indra_view.indexes
view = deeplake_dataset[indexes]

view = DeepLakeQueryDataset(deeplake_ds=deeplake_dataset, indra_ds=indra_view)
view._tql_query = tql_query

return_data = {}

Expand Down
6 changes: 6 additions & 0 deletions docs/source/deeplake.VectorStore.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
deeplake.VectorStore
--------------------

.. autoclass:: deeplake.core.vectorstore.deeplake_vectorstore.VectorStore
    :members:
    :show-inheritance:
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Deep Lake is an open-source database for AI.
:caption: API Reference

deeplake
deeplake.VectorStore
deeplake.core
deeplake.core.dataset
deeplake.core.tensor
Expand All @@ -58,7 +59,6 @@ Deep Lake is an open-source database for AI.
deeplake.util
deeplake.client.log
deeplake.core.transform
deeplake.core.vectorstore
deeplake.core.vectorstore.deep_memory
deeplake.random

Expand Down

0 comments on commit bdb7323

Please sign in to comment.