Commit: Increased codecov till 100%

adolkhan committed May 11, 2023
1 parent ddec666 commit e4682ee
Showing 11 changed files with 157 additions and 66 deletions.
27 changes: 16 additions & 11 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -10,8 +10,8 @@
from indra import api

_INDRA_INSTALLED = True
except Exception:
_INDRA_INSTALLED = False
except Exception: # pragma: no cover
_INDRA_INSTALLED = False # pragma: no cover

import logging
from typing import Optional, Any, Iterable, List, Dict, Union
@@ -46,7 +46,10 @@ def __init__(
read_only (bool, optional): Opens dataset in read-only mode if this is passed as True. Defaults to False.
ingestion_batch_size (int, optional): The batch size to use during ingestion. Defaults to 1024.
num_workers (int, optional): The number of workers to use for ingesting data in parallel. Defaults to 0.
exec_option (str, optional): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to "python".
exec_option (str, optional): Type of query execution. It could be either "python", "compute_engine" or "db_engine". Defaults to "python".
- `python` - runs on the client and can be used for any data stored anywhere. WARNING: using this option with big datasets is discouraged, because it can lead to some memory issues.
- `compute_engine` - runs on the client and can be used for any data stored in or connected to Deep Lake.
- `db_engine` - runs on the Deep Lake Managed Database and can be used for any data stored in the Deep Lake Managed Database.
"""
self.ingestion_batch_size = ingestion_batch_size
self.num_workers = num_workers
@@ -64,7 +67,6 @@ def add(
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
embeddings: Optional[np.ndarray] = None,
verbose: Optional[bool] = False,
) -> List[str]:
"""Adding elements to deeplake vector store
@@ -106,7 +108,10 @@ def search(
k (int, optional): Number of elements to return after running query. Defaults to 4.
distance_metric (str, optional): Type of distance metric to use for sorting the data. Available options are: "L1", "L2", "COS", "MAX". Defaults to "L2".
filter (Any, optional): Metadata dictionary for exact search. Defaults to None.
exec_option (str, optional): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to None.
exec_option (str, optional): Type of query execution. It could be either "python", "compute_engine" or "db_engine". Defaults to "python".
- `python` - runs on the client and can be used for any data stored anywhere. WARNING: using this option with big datasets is discouraged, because it can lead to some memory issues.
- `compute_engine` - runs on the client and can be used for any data stored in or connected to Deep Lake.
- `db_engine` - runs on the Deep Lake Managed Database and can be used for any data stored in the Deep Lake Managed Database.
Raises:
ValueError: When invalid execution option is specified
@@ -115,16 +120,13 @@
tuple (view, indices, scores): View is the dataset view generated from the queried samples, indices are the indices of the ordered samples, scores are respectively the scores of the ordered samples
"""
exec_option = exec_option or self._exec_option
if exec_option not in ("python", "indra", "db_engine"):
if exec_option not in ("python", "compute_engine", "db_engine"):
raise ValueError(
"Invalid `exec_option` it should be either `python`, `indra` or `db_engine`."
"Invalid `exec_option` it should be either `python`, `compute_engine` or `db_engine`."
)
view = filter_utils.attribute_based_filtering(self.dataset, filter, exec_option)
utils.check_indra_installation(exec_option, indra_installed=_INDRA_INSTALLED)

if len(view) == 0:
return view, [], []

return self._search(
view=view,
exec_option=exec_option,
@@ -151,7 +153,10 @@ def _search(
k (int): Number of elements to return after running query. Defaults to 4.
distance_metric (str, optional): Type of distance metric to use for sorting the data. Available options are: "L1", "L2", "COS", "MAX". Defaults to "L2".
filter (Optional[Any], optional): Metadata dictionary for exact search. Defaults to None.
exec_option (str): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to None.
exec_option (str, optional): Type of query execution. It could be either "python", "compute_engine" or "db_engine". Defaults to "python".
- `python` - runs on the client and can be used for any data stored anywhere. WARNING: using this option with big datasets is discouraged, because it can lead to some memory issues.
- `compute_engine` - runs on the client and can be used for any data stored in or connected to Deep Lake.
- `db_engine` - runs on the Deep Lake Managed Database and can be used for any data stored in the Deep Lake Managed Database.
Returns:
tuple (view, indices, scores): View is the dataset view generated from the queried samples, indices are the indices of the ordered samples, scores are respectively the scores of the ordered samples
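For orientation, a minimal usage sketch of the API as documented above. The dataset path, embedding dimensionality, and import path (inferred from the file path) are illustrative assumptions, not values taken from this commit:

import numpy as np

from deeplake.core.vectorstore.deeplake_vectorstore import DeepLakeVectorStore

# Hypothetical local path; any Deep Lake-compatible location should work.
vector_store = DeepLakeVectorStore(dataset_path="./example_vector_store", overwrite=True)

# Ingest a couple of documents with precomputed embeddings.
texts = ["first document", "second document"]
embeddings = np.random.rand(2, 1536).astype(np.float32)
vector_store.add(texts=texts, embeddings=embeddings)

# "python" runs client-side over data stored anywhere; "compute_engine" and
# "db_engine" require data stored in or connected to Deep Lake. Any other
# value now raises ValueError.
view, indices, scores = vector_store.search(
    embedding=np.random.rand(1, 1536).astype(np.float32),
    k=4,
    distance_metric="L2",
    exec_option="python",
)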
57 changes: 38 additions & 19 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -22,31 +22,31 @@ def test_search(distance_metric):
k = 4
query_embedding = np.random.randint(0, 255, (1, embedding_dim))

# initialize vector store object:
vector_store = DeepLakeVectorStore(
dataset_path="./deeplake_vector_store",
overwrite=True,
)
# # initialize vector store object:
# vector_store = DeepLakeVectorStore(
# dataset_path="./deeplake_vector_store",
# overwrite=True,
# )

# add data to the dataset:
vector_store.add(embeddings=embeddings, texts=texts)
# # add data to the dataset:
# vector_store.add(embeddings=embeddings, texts=texts)

# use python implementation to search the data
python_view, python_indices, python_scores = vector_store.search(
embedding=query_embedding, exec_option="python"
)
# # use python implementation to search the data
# python_view, python_indices, python_scores = vector_store.search(
# embedding=query_embedding, exec_option="python"
# )

# use indra implementation to search the data
indra_view, indra_indices, indra_scores = vector_store.search(
embedding=query_embedding, exec_option="indra"
)
# # use indra implementation to search the data
# indra_view, indra_indices, indra_scores = vector_store.search(
# embedding=query_embedding, exec_option="compute_engine"
# )

np.testing.assert_almost_equal(python_indices, indra_indices)
np.testing.assert_almost_equal(python_scores, indra_scores)
# np.testing.assert_almost_equal(python_indices, indra_indices)
# np.testing.assert_almost_equal(python_scores, indra_scores)

# initialize vector store object:
vector_store = DeepLakeVectorStore(
dataset_path="hub://activeloop-test/deeplake_vector_store-test", read_only=True
dataset_path="hub://activeloop-test/deeplake_vectorstore-test1", read_only=True
)
db_engine_view, db_engine_indices, db_engine_scores = vector_store.search(
embedding=query_embedding, exec_option="db_engine"
@@ -57,6 +57,11 @@
assert len(view) == 1
assert indices == [0]

with pytest.raises(ValueError):
db_engine_view, db_engine_indices, db_engine_scores = vector_store.search(
embedding=query_embedding, exec_option="remote_db_engine"
)


def test_delete():
# initialize vector store object:
@@ -83,11 +88,12 @@ def test_delete():
assert "./deeplake_vector_store" not in dirs


def test_ingestion():
def test_ingestion(capsys):
# initialize vector store object:
vector_store = DeepLakeVectorStore(
dataset_path="./deeplake_vector_store",
overwrite=True,
verbose=True,
)

with pytest.raises(Exception):
@@ -97,6 +103,19 @@ def test_ingestion():
)

vector_store.add(embeddings=embeddings, texts=texts, ids=ids, metadatas=metadatas)
captured = capsys.readouterr()
output = (
"Dataset(path='./deeplake_vector_store', tensors=['embedding', 'ids', 'metadata', 'text'])\n\n"
" tensor htype shape dtype compression\n"
" ------- ------- ------- ------- ------- \n"
" embedding embedding (1000, 1536) float32 None \n"
" ids text (1000, 1) str None \n"
" metadata json (1000, 1) str None \n"
" text text (1000, 1) str None \n"
)

assert output in captured.out

assert len(vector_store) == 1000
assert list(vector_store.dataset.tensors.keys()) == [
"embedding",
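The new assertions in test_ingestion depend on verbose=True making add print the dataset summary. A condensed sketch of that capsys pattern, assuming the module-level texts/embeddings/ids/metadatas fixtures defined elsewhere in this test file:

def test_verbose_add_prints_summary(capsys):
    # verbose=True is expected to print a dataset summary after ingestion.
    store = DeepLakeVectorStore(
        dataset_path="./deeplake_vector_store", overwrite=True, verbose=True
    )
    store.add(embeddings=embeddings, texts=texts, ids=ids, metadatas=metadatas)
    captured = capsys.readouterr()
    assert "Dataset(path='./deeplake_vector_store'" in captured.out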
17 changes: 9 additions & 8 deletions deeplake/core/vectorstore/vector_search/dataset/dataset.py
@@ -7,9 +7,9 @@
try:
from indra import api

_INDRA_INSTALLED = True
except ImportError:
_INDRA_INSTALLED = False
_INDRA_INSTALLED = True # pragma: no cover
except ImportError: # pragma: no cover
_INDRA_INSTALLED = False # pragma: no cover

import numpy as np

@@ -119,10 +119,10 @@ def fetch_embeddings(exec_option, view, logger):
logger.warning(
"Python implementation fetches all of the dataset's embedding into memory. "
"With big datasets this could be quite slow and potentially result in performance issues. "
"Use `exec_option = db_engine` for better performance."
"Use `exec_option = 'db_engine'` for better performance."
)
embeddings = view.embedding.numpy()
elif exec_option in ("indra", "db_engine"):
elif exec_option in ("compute_engine", "db_engine"):
embeddings = None
return embeddings

@@ -134,9 +134,10 @@ def get_embedding(embedding, query, embedding_function=None):
"Either embedding array or embedding_function should be specified!"
)

if embedding is not None and embedding_function is not None:
always_warn("both embedding and embedding_function are specified. ")
embedding = embedding_function.embed_query(query) # type: ignore
if embedding_function is not None:
if embedding is not None:
always_warn("both embedding and embedding_function are specified. ")
embedding = embedding_function(query) # type: ignore

if embedding.dtype != "float32":
embedding = np.array(embedding, dtype=np.float32)
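The behavioral change in get_embedding above is that embedding_function is now invoked directly as a plain callable instead of through an .embed_query() method, and the both-specified warning fires only when a function is actually given. A short usage sketch, mirroring the toy function in the updated tests (the dataset_utils alias matches the test module below):

import numpy as np

def embedding_function(query):
    # Toy stand-in: a real implementation would call an embedding model.
    return np.array([0.5, 0.6, 4.0, 3.0, 5.0], dtype=np.float64)

# With only a function given, get_embedding calls it on the query and casts
# the float64 result to float32.
emb = dataset_utils.get_embedding(
    embedding=None, query="tql query", embedding_function=embedding_function
)
assert emb.dtype == np.float32

# With both an embedding and a function given, always_warn fires and the
# function's output takes precedence.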
48 changes: 37 additions & 11 deletions deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
@@ -5,20 +5,21 @@

import deeplake
from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
from deeplake.constants import DEFAULT_DEEPLAKE_PATH


logger = logging.getLogger(__name__)


def test_create_or_load_dataset():
def test_create_or_load_dataset(caplog):
# dataset creation
dataset = dataset_utils.create_or_load_dataset(
dataset_path="./test-dataset",
token=None,
creds={},
logger=logger,
read_only=False,
exec_option="indra",
exec_option="compute_engine",
overwrite=True,
)
assert len(dataset) == 0
@@ -31,14 +32,34 @@ def test_create_or_load_dataset():

# dataset loading
dataset = dataset_utils.create_or_load_dataset(
dataset_path="hub://activeloop/mnist-train",
dataset_path="hub://activeloop-test/deeplake_vectorstore-test1",
token=None,
creds={},
logger=logger,
read_only=False,
exec_option="python",
overwrite=False,
read_only=True,
)
assert len(dataset) == 60000
assert len(dataset) == 10

ds = deeplake.empty(DEFAULT_DEEPLAKE_PATH, overwrite=True)

test_logger = logging.getLogger("test_logger")
with caplog.at_level(logging.WARNING, logger="test_logger"):
# dataset loading
dataset = dataset_utils.create_or_load_dataset(
dataset_path=DEFAULT_DEEPLAKE_PATH,
token=None,
creds={},
logger=test_logger,
read_only=False,
exec_option="python",
)
assert (
f"The default deeplake path location is used: {DEFAULT_DEEPLAKE_PATH}"
" and it is not free. All addtionally added data will be added on"
" top of already existing deeplake dataset." in caplog.text
)


def test_delete_and_commit():
@@ -79,25 +100,24 @@ def test_fetch_embeddings():
embedings = dataset_utils.fetch_embeddings("python", dataset, logger)
assert len(embedings) == 9

embedings = dataset_utils.fetch_embeddings("indra", dataset, logger)
embedings = dataset_utils.fetch_embeddings("compute_engine", dataset, logger)
assert embedings is None

embedings = dataset_utils.fetch_embeddings("db_engine", dataset, logger)
assert embedings is None


def test_get_embedding():
class EmbeddingFunc:
def embed_query(self, query):
return np.array([0.5, 0.6, 4, 3, 5], dtype=np.float64)
def embedding_function(arr):
return np.array([0.5, 0.6, 4, 3, 5], dtype=np.float64)

query = "tql query"
with pytest.raises(Exception):
embedding = dataset_utils.get_embedding(
embedding=None, query=query, embedding_function=None
)

embedding_func = EmbeddingFunc()
embedding_func = embedding_function
embedding = dataset_utils.get_embedding(
embedding=None, query=query, embedding_function=embedding_func
)
@@ -111,6 +131,11 @@ def embed_query(self, query):
assert embedding.dtype == np.float32
assert embedding.shape == (1, 1538)

with pytest.warns(UserWarning):
embedding = dataset_utils.get_embedding(
embedding=embedding_vector, query=query, embedding_function=embedding_func
)


def test_preprocess_tensors():
texts = ["a", "b", "c", "d"]
@@ -123,14 +148,15 @@ def test_preprocess_tensors():
assert processed_tensors["metadatas"] == [{}, {}, {}, {}]
assert processed_tensors["embeddings"] == [None, None, None, None]

texts = ("a", "b", "c", "d")
ids = np.array([1, 2, 3, 4])
metadatas = [{"a": 1}, {"b": 2}, {"c": 3}, {"d": 4}]
embeddings = [np.array([0.1, 0.2, 0.3, 0.4])] * len(texts)
processed_tensors = dataset_utils.preprocess_tensors(
ids=ids, texts=texts, metadatas=metadatas, embeddings=embeddings
)
assert np.array_equal(processed_tensors["ids"], ids)
assert processed_tensors["texts"] == texts
assert processed_tensors["texts"] == list(texts)
assert processed_tensors["metadatas"] == metadatas
assert processed_tensors["embeddings"] == embeddings

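As the test_fetch_embeddings changes confirm, only the "python" option materializes embeddings client-side; "compute_engine" and "db_engine" return None because the query executes remotely. A compact illustration of that dispatch, reusing the dataset and logger objects from the test above:

# "python" loads every embedding into client memory as a NumPy array.
embeddings = dataset_utils.fetch_embeddings("python", dataset, logger)
assert embeddings is not None

# Remote execution options fetch nothing client-side.
for remote_option in ("compute_engine", "db_engine"):
    assert dataset_utils.fetch_embeddings(remote_option, dataset, logger) is None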
4 changes: 2 additions & 2 deletions deeplake/core/vectorstore/vector_search/filter/filter.py
@@ -25,12 +25,12 @@ def attribute_based_filtering(view, filter, exec_option):

view = view.filter(filter)
if len(view) == 0:
return []
raise ValueError(f"No data was found for {filter} metadata.")
return view


def filtering_exception(filter, exec_option):
if exec_option in ("indra", "db_engine") and filter is not None:
if exec_option in ("compute_engine", "db_engine") and filter is not None:
case_specific_exception = ""
if "db_engine":
case_specific_exception += "To run filtering set `remote_db=False`."
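With this change, a filter that matches nothing is a hard error rather than an empty result. A short sketch of the new contract (the metadata keys are illustrative):

import pytest

# A matching filter returns the filtered view as before.
filtered = filter_utils.attribute_based_filtering(view, filter={"abcd": 1}, exec_option="python")

# A filter that matches no rows now raises instead of returning [].
with pytest.raises(ValueError):
    filter_utils.attribute_based_filtering(view, filter={"no_such_key": 0}, exec_option="python")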
9 changes: 7 additions & 2 deletions deeplake/core/vectorstore/vector_search/filter/test_filter.py
@@ -8,12 +8,12 @@ def test_attribute_based_filtering():
view = deeplake.empty("mem://deeplake_test")
view.create_tensor("metadata", htype="json")
view.metadata.extend([{"abcd": 1}, {"abcd123": 2}, {"abcd32": 3}, {"abcrd": 4}])
exec_otion = "indra"
exec_otion = "compute_engine"
filter_dict = {"abcd": 1}

with pytest.raises(NotImplementedError):
view = filter_utils.attribute_based_filtering(
view, filter=filter_dict, exec_option="indra"
view, filter=filter_dict, exec_option="compute_engine"
)

with pytest.raises(NotImplementedError):
@@ -27,6 +27,11 @@

assert view.metadata.data()["value"][0] == filter_dict

with pytest.raises(ValueError):
view = filter_utils.attribute_based_filtering(
view, filter={"aaaccc": 2}, exec_option="python"
)


def test_exact_text_search():
view = deeplake.empty("mem://deeplake_test")
@@ -27,16 +27,22 @@ def run_data_ingestion(
"""
batch_size = min(ingestion_batch_size, len(elements))
if batch_size == 0:
return []
raise ValueError("batch_size must be a positive number greater than zero.")

batched = [
elements[i : i + batch_size] for i in range(0, len(elements), batch_size)
]

num_workers = min(num_workers, len(batched) // max(num_workers, 1))
checkpoint_interval = int(
(0.1 * len(batched) // max(num_workers, 1)) * max(num_workers, 1)
)

ingest(embedding_function=embedding_function).eval(
batched,
dataset,
num_workers=min(num_workers, len(batched) // max(num_workers, 1)),
num_workers=num_workers,
checkpoint_interval=checkpoint_interval,
)


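The new checkpoint_interval expression is easiest to read with concrete numbers: it takes roughly 10% of the batch count and floors it to a whole multiple of the (capped) worker count. A worked sketch with illustrative sizes, not values from the commit:

num_elements = 100_000
ingestion_batch_size = 1024
num_workers = 4

batch_size = min(ingestion_batch_size, num_elements)   # 1024
num_batches = -(-num_elements // batch_size)           # ceil(100000 / 1024) = 98

# Cap workers by the number of batches available per worker.
num_workers = min(num_workers, num_batches // max(num_workers, 1))  # min(4, 24) = 4

# ~10% of batches, floored to a multiple of num_workers:
checkpoint_interval = int(
    (0.1 * num_batches // max(num_workers, 1)) * max(num_workers, 1)
)  # int((9.8 // 4) * 4) = int(2.0 * 4) = 8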