Commit
Addressed comments
adolkhan committed May 10, 2023
1 parent 95750d3 commit 9daab71
Showing 5 changed files with 45 additions and 19 deletions.
20 changes: 11 additions & 9 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -34,6 +34,7 @@ def __init__(
ingestion_batch_size: int = 1024,
num_workers: int = 0,
exec_option: str = "python",
verbose=False,
**kwargs: Any,
) -> None:
"""DeepLakeVectorStore initialization
@@ -55,13 +56,14 @@ def __init__(
)
self.embedding_function = embedding_function
self._exec_option = exec_option
self.verbose = verbose

def add(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
- embeddings: Optional[Union[List[float], np.ndarray]] = None,
+ embeddings: Optional[np.ndarray] = None,
verbose: Optional[bool] = False,
) -> List[str]:
"""Adding elements to deeplake vector store
@@ -70,7 +72,7 @@ def add(
texts (Iterable[str]): texts to add to deeplake vector store
metadatas (Optional[List[dict]], optional): List of metadatas. Defaults to None.
ids (Optional[List[str]], optional): List of document IDs. Defaults to None.
- embeddings (Optional[Union[List[float], np.ndarray]]): embedding of texts. Defaults to None.
+ embeddings (Optional[np.ndarray]): embedding of texts. Defaults to None.
Returns:
ids (List[str]): List of document IDs
"""
@@ -83,14 +85,14 @@ def add(
num_workers=self.num_workers,
)
self.dataset.commit(allow_empty=True)
- if verbose:
+ if self.verbose:
self.dataset.summary()
return ids

def search(
self,
query: Optional[str] = None,
- embedding: Optional[Union[List[float], np.ndarray]] = None,
+ embedding: Optional[np.ndarray] = None,
k: int = 4,
distance_metric: str = "L2",
filter: Optional[Any] = None,
@@ -100,7 +102,7 @@ def search(
Args:
query (Optional[str], optional): String representation of the query to run. Defaults to None.
- embedding (Optional[Union[List[float], np.ndarray]], optional): Embedding representation of the query to run. Defaults to None.
+ embedding (Optional[np.ndarray], optional): Embedding representation of the query to run. Defaults to None.
k (int, optional): Number of elements to return after running query. Defaults to 4.
distance_metric (str, optional): Type of distance metric to use for sorting the data. Available options are: "L1", "L2", "COS", "MAX". Defaults to "L2".
filter (Optional[Any], optional): Metadata dictionary for exact search. Defaults to None.
@@ -135,21 +137,21 @@ def search(
def _search(
self,
view,
- exec_option: bool,
+ exec_option: str,
embedding: Optional[Union[List[float], np.ndarray]] = None,
query: Optional[str] = None,
- k: Optional[int] = 4,
+ k: int = 4,
distance_metric: Optional[str] = "L2",
):
"""Internal DeepLakeVectorStore search method
Args:
query (Optional[str], optional): String representation of the query to run. Defaults to None.
embedding (Optional[Union[List[float], np.ndarray]], optional): Embedding representation of the query to run. Defaults to None.
- k (int, optional): Number of elements to return after running query. Defaults to 4.
+ k (int): Number of elements to return after running query. Defaults to 4.
distance_metric (str, optional): Type of distance metric to use for sorting the data. Available options are: "L1", "L2", "COS", "MAX". Defaults to "L2".
filter (Optional[Any], optional): Metadata dictionary for exact search. Defaults to None.
- exec_option (Optional[str], optional): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to None.
+ exec_option (str): Type of query execution. It could be either "python", "indra" or "db_engine". Defaults to None.
Returns:
tuple (view, indices, scores): View is the dataset view generated from the queried samples, indices are the indices of the ordered samples, scores are respectively the scores of the ordered samples
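A companion sketch for the updated search() signature documented above, reusing the hypothetical vector_store from the previous sketch; the query vector is assumed to match the stored embedding dimensionality, and the return value is left unexamined because its structure is not part of this diff.

import numpy as np

# Hypothetical query vector; same dimensionality as the stored embeddings.
query_embedding = np.random.rand(1536).astype(np.float32)

results = vector_store.search(
    embedding=query_embedding,  # np.ndarray, per the updated annotation
    k=4,                        # number of nearest results
    distance_metric="COS",      # one of "L1", "L2", "COS", "MAX"
    filter={"source": "a"},     # exact-match metadata filter, per the docstring
)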
6 changes: 3 additions & 3 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
@@ -100,8 +100,8 @@ def test_ingestion():
vector_store.add(embeddings=embeddings, texts=texts, ids=ids, metadatas=metadatas)
assert len(vector_store) == 1000
assert list(vector_store.dataset.tensors.keys()) == [
"embeddings",
"embedding",
"ids",
"metadatas",
"texts",
"metadata",
"text",
]
@@ -7,7 +7,7 @@


def vector_search(
- query_embedding: Optional[Union[List[float], np.ndarray]],
+ query_embedding: Optional[np.ndarray],
distance_metric: str,
deeplake_dataset: DeepLakeDataset,
k: int,
@@ -24,7 +24,7 @@ def vector_search(
embedding_tensor (str): name of the tensor in the dataset with `htype = "embedding"`.
**kwargs (Any): Any additional parameters.
"""
- from indra import api
+ from indra import api  # type: ignore

tql_query = query.parse_query(distance_metric, k, query_embedding, embedding_tensor)
indra_ds = api.dataset(deeplake_dataset.path)
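The # type: ignore added above reflects that indra is an optional dependency without type stubs. Purely as an illustration (this is not code from the repository), an import guard like the following could turn a missing optional dependency into a clearer error:

def load_indra_api():
    """Import the optional indra package, raising a clearer error if it is absent."""
    try:
        from indra import api  # type: ignore
    except ImportError as err:
        raise ImportError(
            "The optional 'indra' dependency is required for exec_option='indra' searches."
        ) from err
    return api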
Expand Up @@ -33,24 +33,29 @@ def run_data_ingestion(
elements[i : i + batch_size] for i in range(0, len(elements), batch_size)
]

- ingest(_embedding_function=embedding_function).eval(
+ ingest(embedding_function=embedding_function).eval(
batched,
dataset,
num_workers=min(num_workers, len(batched) // max(num_workers, 1)),
)


@deeplake.compute
- def ingest(sample_in: list, sample_out: list, _embedding_function) -> None:
+ def ingest(sample_in: list, sample_out: list, embedding_function) -> None:
text_list = [s["text"] for s in sample_in]

embeds = [None] * len(text_list)
- if _embedding_function is not None:
-     embeddings = _embedding_function.embed_documents(text_list)
+ if embedding_function is not None:
+     try:
+         embeddings = embedding_function(text_list)
+     except Exception as e:
+         raise Exception(
+             "Could not use embedding function. Please try again with a different embedding function."
+         )
embeds = [np.array(e, dtype=np.float32) for e in embeddings]

for s, e in zip(sample_in, embeds):
- embedding = e if _embedding_function else s["embedding"]
+ embedding = e if embedding_function else s["embedding"]
sample_out.append(
{
"text": s["text"],
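Because ingest() now calls embedding_function(text_list) directly rather than embedding_function.embed_documents(text_list), any callable that maps a list of texts to a list of vectors fits the new contract. A minimal sketch of such a callable, using placeholder zero vectors rather than a real model:

from typing import List

import numpy as np


def dummy_embedding_function(texts: List[str]) -> List[np.ndarray]:
    """Return one fixed-size placeholder vector per input text."""
    return [np.zeros(1536, dtype=np.float32) for _ in texts]


# It would then be passed straight to the ingestion entry point, mirroring the
# keyword arguments used in the test below (dataset and elements defined elsewhere):
# data_ingestion.run_data_ingestion(
#     dataset=dataset,
#     elements=elements,
#     embedding_function=dummy_embedding_function,
#     ingestion_batch_size=1024,
#     num_workers=0,
# )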
@@ -1,9 +1,20 @@
import numpy as np

import random

import deeplake
from deeplake.constants import MB
from deeplake.core.vectorstore.vector_search.ingestion import data_ingestion

random.seed(1)


def corrupted_embedding_function(emb):
p = random.uniform(0, 1)
if p > 0.9:
raise Exception("CorruptedEmbeddingFunction")
return np.zeros((1, 1536), dtype=np.float32)


def test_data_ingestion():
data = [
@@ -77,3 +88,11 @@ def test_data_ingestion():
)

assert len(dataset) == 4
extended_data = data * 10
data_ingestion.run_data_ingestion(
dataset=dataset,
elements=extended_data,
embedding_function=corrupted_embedding_function,
ingestion_batch_size=1,
num_workers=2,
)
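The added test drives ingestion through corrupted_embedding_function but, as shown, does not assert on the outcome. If the intent is to check that the failure reaches the caller, a sketch along these lines could work; the element fields, the dataset fixture, and the assumption that the error surfaces as an Exception subclass are all unverified here.

import pytest

from deeplake.core.vectorstore.vector_search.ingestion import data_ingestion


def always_failing_embedding_function(texts):
    # Hypothetical helper: fails on every call, so the expected error is deterministic.
    raise Exception("CorruptedEmbeddingFunction")


def test_data_ingestion_with_failing_embedding_function(dataset):  # dataset fixture assumed
    with pytest.raises(Exception):
        data_ingestion.run_data_ingestion(
            dataset=dataset,
            elements=[{"text": "doc", "id": "1", "metadata": {}}],  # assumed element shape
            embedding_function=always_failing_embedding_function,
            ingestion_batch_size=1,
            num_workers=0,
        )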
