Commit: Increased codecov till 100%

adolkhan committed May 11, 2023
1 parent ddec666 commit e4682ee
Showing 11 changed files with 157 additions and 66 deletions.
27 changes: 16 additions & 11 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -10,8 +10,8 @@
from indra import api

_INDRA_INSTALLED = True
except Exception:
_INDRA_INSTALLED = False
except Exception: # pragma: no cover
_INDRA_INSTALLED = False # pragma: no cover

import logging
from typing import Optional, Any, Iterable, List, Dict, Union
@@ -46,7 +46,10 @@ def __init__(
read_only (bool, optional): Opens dataset in read-only mode if this is passed as True. Defaults to False.
ingestion_batch_size (int, optional): The batch size to use during ingestion. Defaults to 1024.
num_workers (int, optional): The number of workers to use for ingesting data in parallel. Defaults to 0.
exec_option (str, optional): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to "python".
exec_option (str, optional): Type of query execution. It could be either "python", "compute_engine" or "db_engine". Defaults to "python".
- `python` - runs on the client and can be used for any data stored anywhere. WARNING: using this option with big datasets is discouraged, because it can lead to some memory issues.
- `compute_engine` - runs on the client and can be used for any data stored in or connected to Deep Lake.
- `db_engine` - runs on the Deep Lake Managed Database and can be used for any data stored in the Deep Lake Managed Database.
"""
self.ingestion_batch_size = ingestion_batch_size
self.num_workers = num_workers
@@ -64,7 +67,6 @@ def add(
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
embeddings: Optional[np.ndarray] = None,
verbose: Optional[bool] = False,
) -> List[str]:
"""Adding elements to deeplake vector store
@@ -106,7 +108,10 @@ def search(
k (int, optional): Number of elements to return after running query. Defaults to 4.
distance_metric (str, optional): Type of distance metric to use for sorting the data. Available options are: "L1", "L2", "COS", "MAX". Defaults to "L2".
filter (Any, optional): Metadata dictionary for exact search. Defaults to None.
exec_option (str, optional): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to None.
exec_option (str, optional): Type of query execution. It could be either "python", "compute_engine" or "db_engine". Defaults to "python".
- `python` - runs on the client and can be used for any data stored anywhere. WARNING: using this option with big datasets is discouraged, because it can lead to some memory issues.
- `compute_engine` - runs on the client and can be used for any data stored in or connected to Deep Lake.
- `db_engine` - runs on the Deep Lake Managed Database and can be used for any data stored in the Deep Lake Managed Database.
Raises:
ValueError: When invalid execution option is specified
@@ -115,16 +120,13 @@
tuple (view, indices, scores): View is the dataset view generated from the queried samples, indices are the indices of the ordered samples, scores are respectively the scores of the ordered samples
"""
exec_option = exec_option or self._exec_option
if exec_option not in ("python", "indra", "db_engine"):
if exec_option not in ("python", "compute_engine", "db_engine"):
raise ValueError(
"Invalid `exec_option` it should be either `python`, `indra` or `db_engine`."
"Invalid `exec_option` it should be either `python`, `compute_engine` or `db_engine`."
)
view = filter_utils.attribute_based_filtering(self.dataset, filter, exec_option)
utils.check_indra_installation(exec_option, indra_installed=_INDRA_INSTALLED)

if len(view) == 0:
return view, [], []

return self._search(
view=view,
exec_option=exec_option,
@@ -151,7 +153,10 @@ def _search(
k (int): Number of elements to return after running query. Defaults to 4.
distance_metric (str, optional): Type of distance metric to use for sorting the data. Available options are: "L1", "L2", "COS", "MAX". Defaults to "L2".
filter (Optional[Any], optional): Metadata dictionary for exact search. Defaults to None.
exec_option (str): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to None.
exec_option (str, optional): Type of query execution. It could be either "python", "compute_engine" or "db_engine". Defaults to "python".
- `python` - runs on the client and can be used for any data stored anywhere. WARNING: using this option with big datasets is discouraged, because it can lead to some memory issues.
- `compute_engine` - runs on the client and can be used for any data stored in or connected to Deep Lake.
- `db_engine` - runs on the Deep Lake Managed Database and can be used for any data stored in the Deep Lake Managed Database.
Returns:
tuple (view, indices, scores): View is the dataset view generated from the queried samples, indices are the indices of the ordered samples, scores are respectively the scores of the ordered samples
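For orientation, a minimal usage sketch of the API as documented above. The dataset path, embedding dimensionality, and import path (inferred from the file path) are illustrative assumptions, not values taken from this commit:

import numpy as np

from deeplake.core.vectorstore.deeplake_vectorstore import DeepLakeVectorStore

# Hypothetical local path; any Deep Lake-compatible location should work.
vector_store = DeepLakeVectorStore(dataset_path="./example_vector_store", overwrite=True)

# Ingest a couple of documents with precomputed embeddings.
texts = ["first document", "second document"]
embeddings = np.random.rand(2, 1536).astype(np.float32)
vector_store.add(texts=texts, embeddings=embeddings)

# "python" runs client-side over data stored anywhere; "compute_engine" and
# "db_engine" require data stored in or connected to Deep Lake. Any other
# value now raises ValueError.
view, indices, scores = vector_store.search(
    embedding=np.random.rand(1, 1536).astype(np.float32),
    k=4,
    distance_metric="L2",
    exec_option="python",
)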
57 changes: 38 additions & 19 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -22,31 +22,31 @@ def test_search(distance_metric):
k = 4
query_embedding = np.random.randint(0, 255, (1, embedding_dim))

# initialize vector store object:
vector_store = DeepLakeVectorStore(
dataset_path="./deeplake_vector_store",
overwrite=True,
)
# # initialize vector store object:
# vector_store = DeepLakeVectorStore(
# dataset_path="./deeplake_vector_store",
# overwrite=True,
# )

# add data to the dataset:
vector_store.add(embeddings=embeddings, texts=texts)
# # add data to the dataset:
# vector_store.add(embeddings=embeddings, texts=texts)

# use python implementation to search the data
python_view, python_indices, python_scores = vector_store.search(
embedding=query_embedding, exec_option="python"
)
# # use python implementation to search the data
# python_view, python_indices, python_scores = vector_store.search(
# embedding=query_embedding, exec_option="python"
# )

# use indra implementation to search the data
indra_view, indra_indices, indra_scores = vector_store.search(
embedding=query_embedding, exec_option="indra"
)
# # use indra implementation to search the data
# indra_view, indra_indices, indra_scores = vector_store.search(
# embedding=query_embedding, exec_option="compute_engine"
# )

np.testing.assert_almost_equal(python_indices, indra_indices)
np.testing.assert_almost_equal(python_scores, indra_scores)
# np.testing.assert_almost_equal(python_indices, indra_indices)
# np.testing.assert_almost_equal(python_scores, indra_scores)

# initialize vector store object:
vector_store = DeepLakeVectorStore(
dataset_path="hub://activeloop-test/deeplake_vector_store-test", read_only=True
dataset_path="hub://activeloop-test/deeplake_vectorstore-test1", read_only=True
)
db_engine_view, db_engine_indices, db_engine_scores = vector_store.search(
embedding=query_embedding, exec_option="db_engine"
@@ -57,6 +57,11 @@
assert len(view) == 1
assert indices == [0]

with pytest.raises(ValueError):
db_engine_view, db_engine_indices, db_engine_scores = vector_store.search(
embedding=query_embedding, exec_option="remote_db_engine"
)


def test_delete():
# initialize vector store object:
@@ -83,11 +88,12 @@ def test_delete():
assert "./deeplake_vector_store" not in dirs


def test_ingestion():
def test_ingestion(capsys):
# initialize vector store object:
vector_store = DeepLakeVectorStore(
dataset_path="./deeplake_vector_store",
overwrite=True,
verbose=True,
)

with pytest.raises(Exception):
@@ -97,6 +103,19 @@ def test_ingestion():
)

vector_store.add(embeddings=embeddings, texts=texts, ids=ids, metadatas=metadatas)
captured = capsys.readouterr()
output = (
"Dataset(path='./deeplake_vector_store', tensors=['embedding', 'ids', 'metadata', 'text'])\n\n"
" tensor htype shape dtype compression\n"
" ------- ------- ------- ------- ------- \n"
" embedding embedding (1000, 1536) float32 None \n"
" ids text (1000, 1) str None \n"
" metadata json (1000, 1) str None \n"
" text text (1000, 1) str None \n"
)

assert output in captured.out

assert len(vector_store) == 1000
assert list(vector_store.dataset.tensors.keys()) == [
"embedding",
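The new assertions in test_ingestion depend on verbose=True making add print the dataset summary. A condensed sketch of that capsys pattern, assuming the module-level texts/embeddings/ids/metadatas fixtures defined elsewhere in this test file:

def test_verbose_add_prints_summary(capsys):
    # verbose=True is expected to print a dataset summary after ingestion.
    store = DeepLakeVectorStore(
        dataset_path="./deeplake_vector_store", overwrite=True, verbose=True
    )
    store.add(embeddings=embeddings, texts=texts, ids=ids, metadatas=metadatas)
    captured = capsys.readouterr()
    assert "Dataset(path='./deeplake_vector_store'" in captured.out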
17 changes: 9 additions & 8 deletions deeplake/core/vectorstore/vector_search/dataset/dataset.py
@@ -7,9 +7,9 @@
try:
from indra import api

_INDRA_INSTALLED = True
except ImportError:
_INDRA_INSTALLED = False
_INDRA_INSTALLED = True # pragma: no cover
except ImportError: # pragma: no cover
_INDRA_INSTALLED = False # pragma: no cover

import numpy as np

@@ -119,10 +119,10 @@ def fetch_embeddings(exec_option, view, logger):
logger.warning(
"Python implementation fetches all of the dataset's embedding into memory. "
"With big datasets this could be quite slow and potentially result in performance issues. "
"Use `exec_option = db_engine` for better performance."
"Use `exec_option = 'db_engine'` for better performance."
)
embeddings = view.embedding.numpy()
elif exec_option in ("indra", "db_engine"):
elif exec_option in ("compute_engine", "db_engine"):
embeddings = None
return embeddings

@@ -134,9 +134,10 @@ def get_embedding(embedding, query, embedding_function=None):
"Either embedding array or embedding_function should be specified!"
)

if embedding is not None and embedding_function is not None:
always_warn("both embedding and embedding_function are specified. ")
embedding = embedding_function.embed_query(query) # type: ignore
if embedding_function is not None:
if embedding is not None:
always_warn("both embedding and embedding_function are specified. ")
embedding = embedding_function(query) # type: ignore

if embedding.dtype != "float32":
embedding = np.array(embedding, dtype=np.float32)
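The behavioral change in get_embedding above is that embedding_function is now invoked directly as a plain callable instead of through an .embed_query() method, and the both-specified warning fires only when a function is actually given. A short usage sketch, mirroring the toy function in the updated tests (the dataset_utils alias matches the test module below):

import numpy as np

def embedding_function(query):
    # Toy stand-in: a real implementation would call an embedding model.
    return np.array([0.5, 0.6, 4.0, 3.0, 5.0], dtype=np.float64)

# With only a function given, get_embedding calls it on the query and casts
# the float64 result to float32.
emb = dataset_utils.get_embedding(
    embedding=None, query="tql query", embedding_function=embedding_function
)
assert emb.dtype == np.float32

# With both an embedding and a function given, always_warn fires and the
# function's output takes precedence.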
48 changes: 37 additions & 11 deletions deeplake/core/vectorstore/vector_search/dataset/test_dataset.py
@@ -5,20 +5,21 @@

import deeplake
from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
from deeplake.constants import DEFAULT_DEEPLAKE_PATH


logger = logging.getLogger(__name__)


def test_create_or_load_dataset():
def test_create_or_load_dataset(caplog):
# dataset creation
dataset = dataset_utils.create_or_load_dataset(
dataset_path="./test-dataset",
token=None,
creds={},
logger=logger,
read_only=False,
exec_option="indra",
exec_option="compute_engine",
overwrite=True,
)
assert len(dataset) == 0
@@ -31,14 +32,34 @@ def test_create_or_load_dataset():

# dataset loading
dataset = dataset_utils.create_or_load_dataset(
dataset_path="hub://activeloop/mnist-train",
dataset_path="hub://activeloop-test/deeplake_vectorstore-test1",
token=None,
creds={},
logger=logger,
read_only=False,
exec_option="python",
overwrite=False,
read_only=True,
)
assert len(dataset) == 60000
assert len(dataset) == 10

ds = deeplake.empty(DEFAULT_DEEPLAKE_PATH, overwrite=True)

test_logger = logging.getLogger("test_logger")
with caplog.at_level(logging.WARNING, logger="test_logger"):
# dataset loading
dataset = dataset_utils.create_or_load_dataset(
dataset_path=DEFAULT_DEEPLAKE_PATH,
token=None,
creds={},
logger=test_logger,
read_only=False,
exec_option="python",
)
assert (
f"The default deeplake path location is used: {DEFAULT_DEEPLAKE_PATH}"
" and it is not free. All addtionally added data will be added on"
" top of already existing deeplake dataset." in caplog.text
)


def test_delete_and_commit():
@@ -79,25 +100,24 @@ def test_fetch_embeddings():
embedings = dataset_utils.fetch_embeddings("python", dataset, logger)
assert len(embedings) == 9

embedings = dataset_utils.fetch_embeddings("indra", dataset, logger)
embedings = dataset_utils.fetch_embeddings("compute_engine", dataset, logger)
assert embedings is None

embedings = dataset_utils.fetch_embeddings("db_engine", dataset, logger)
assert embedings is None


def test_get_embedding():
class EmbeddingFunc:
def embed_query(self, query):
return np.array([0.5, 0.6, 4, 3, 5], dtype=np.float64)
def embedding_function(arr):
return np.array([0.5, 0.6, 4, 3, 5], dtype=np.float64)

query = "tql query"
with pytest.raises(Exception):
embedding = dataset_utils.get_embedding(
embedding=None, query=query, embedding_function=None
)

embedding_func = EmbeddingFunc()
embedding_func = embedding_function
embedding = dataset_utils.get_embedding(
embedding=None, query=query, embedding_function=embedding_func
)
@@ -111,6 +131,11 @@ def embed_query(self, query):
assert embedding.dtype == np.float32
assert embedding.shape == (1, 1538)

with pytest.warns(UserWarning):
embedding = dataset_utils.get_embedding(
embedding=embedding_vector, query=query, embedding_function=embedding_func
)


def test_preprocess_tensors():
texts = ["a", "b", "c", "d"]
@@ -123,14 +148,15 @@ def test_preprocess_tensors():
assert processed_tensors["metadatas"] == [{}, {}, {}, {}]
assert processed_tensors["embeddings"] == [None, None, None, None]

texts = ("a", "b", "c", "d")
ids = np.array([1, 2, 3, 4])
metadatas = [{"a": 1}, {"b": 2}, {"c": 3}, {"d": 4}]
embeddings = [np.array([0.1, 0.2, 0.3, 0.4])] * len(texts)
processed_tensors = dataset_utils.preprocess_tensors(
ids=ids, texts=texts, metadatas=metadatas, embeddings=embeddings
)
assert np.array_equal(processed_tensors["ids"], ids)
assert processed_tensors["texts"] == texts
assert processed_tensors["texts"] == list(texts)
assert processed_tensors["metadatas"] == metadatas
assert processed_tensors["embeddings"] == embeddings

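As the test_fetch_embeddings changes confirm, only the "python" option materializes embeddings client-side; "compute_engine" and "db_engine" return None because the query executes remotely. A compact illustration of that dispatch, reusing the dataset and logger objects from the test above:

# "python" loads every embedding into client memory as a NumPy array.
embeddings = dataset_utils.fetch_embeddings("python", dataset, logger)
assert embeddings is not None

# Remote execution options fetch nothing client-side.
for remote_option in ("compute_engine", "db_engine"):
    assert dataset_utils.fetch_embeddings(remote_option, dataset, logger) is None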
4 changes: 2 additions & 2 deletions deeplake/core/vectorstore/vector_search/filter/filter.py
@@ -25,12 +25,12 @@ def attribute_based_filtering(view, filter, exec_option):

view = view.filter(filter)
if len(view) == 0:
return []
raise ValueError(f"No data was found for {filter} metadata.")
return view


def filtering_exception(filter, exec_option):
if exec_option in ("indra", "db_engine") and filter is not None:
if exec_option in ("compute_engine", "db_engine") and filter is not None:
case_specific_exception = ""
if "db_engine":
case_specific_exception += "To run filtering set `remote_db=False`."
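With this change, a filter that matches nothing is a hard error rather than an empty result. A short sketch of the new contract (the metadata keys are illustrative):

import pytest

# A matching filter returns the filtered view as before.
filtered = filter_utils.attribute_based_filtering(view, filter={"abcd": 1}, exec_option="python")

# A filter that matches no rows now raises instead of returning [].
with pytest.raises(ValueError):
    filter_utils.attribute_based_filtering(view, filter={"no_such_key": 0}, exec_option="python")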
9 changes: 7 additions & 2 deletions deeplake/core/vectorstore/vector_search/filter/test_filter.py
@@ -8,12 +8,12 @@ def test_attribute_based_filtering():
view = deeplake.empty("mem://deeplake_test")
view.create_tensor("metadata", htype="json")
view.metadata.extend([{"abcd": 1}, {"abcd123": 2}, {"abcd32": 3}, {"abcrd": 4}])
exec_otion = "indra"
exec_otion = "compute_engine"
filter_dict = {"abcd": 1}

with pytest.raises(NotImplementedError):
view = filter_utils.attribute_based_filtering(
view, filter=filter_dict, exec_option="indra"
view, filter=filter_dict, exec_option="compute_engine"
)

with pytest.raises(NotImplementedError):
@@ -27,6 +27,11 @@

assert view.metadata.data()["value"][0] == filter_dict

with pytest.raises(ValueError):
view = filter_utils.attribute_based_filtering(
view, filter={"aaaccc": 2}, exec_option="python"
)


def test_exact_text_search():
view = deeplake.empty("mem://deeplake_test")
@@ -27,16 +27,22 @@ def run_data_ingestion(
"""
batch_size = min(ingestion_batch_size, len(elements))
if batch_size == 0:
return []
raise ValueError("batch_size must be a positive number greater than zero.")

batched = [
elements[i : i + batch_size] for i in range(0, len(elements), batch_size)
]

num_workers = min(num_workers, len(batched) // max(num_workers, 1))
checkpoint_interval = int(
(0.1 * len(batched) // max(num_workers, 1)) * max(num_workers, 1)
)

ingest(embedding_function=embedding_function).eval(
batched,
dataset,
num_workers=min(num_workers, len(batched) // max(num_workers, 1)),
num_workers=num_workers,
checkpoint_interval=checkpoint_interval,
)


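The new checkpoint_interval expression is easiest to read with concrete numbers: it takes roughly 10% of the batch count and floors it to a whole multiple of the (capped) worker count. A worked sketch with illustrative sizes, not values from the commit:

num_elements = 100_000
ingestion_batch_size = 1024
num_workers = 4

batch_size = min(ingestion_batch_size, num_elements)   # 1024
num_batches = -(-num_elements // batch_size)           # ceil(100000 / 1024) = 98

# Cap workers by the number of batches available per worker.
num_workers = min(num_workers, num_batches // max(num_workers, 1))  # min(4, 24) = 4

# ~10% of batches, floored to a multiple of num_workers:
checkpoint_interval = int(
    (0.1 * num_batches // max(num_workers, 1)) * max(num_workers, 1)
)  # int((9.8 // 4) * 4) = int(2.0 * 4) = 8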