diff --git a/deeplake/constants.py b/deeplake/constants.py index e1609bbd91..e87a8d0ec0 100644 --- a/deeplake/constants.py +++ b/deeplake/constants.py @@ -183,3 +183,7 @@ TRANSFORM_RECHUNK_AVG_SIZE_BOUND = 0.1 TIME_INTERVAL_FOR_CUDA_MEMORY_CLEANING = 10 * 60 + +# Transform cache sizes +DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE = 16 +TRANSFORM_CHUNK_CACHE_SIZE = 64 * MB diff --git a/deeplake/core/chunk_engine.py b/deeplake/core/chunk_engine.py index b8ca69c07c..835afff572 100644 --- a/deeplake/core/chunk_engine.py +++ b/deeplake/core/chunk_engine.py @@ -2721,6 +2721,10 @@ def get_avg_chunk_size(self): dtype = self.tensor_meta.dtype if dtype in ("Any", "List", None): return None - nbytes = np.prod([num_samples] + max_shape) * np.dtype(dtype).itemsize + shape = [num_samples] + max_shape + nbytes = 1 + for dim in shape: # not using np.prod to avoid overflow + nbytes *= dim + nbytes = nbytes * np.dtype(dtype).itemsize avg_chunk_size = nbytes / num_chunks return avg_chunk_size diff --git a/deeplake/core/transform/transform.py b/deeplake/core/transform/transform.py index 358459edb5..d9b0399511 100644 --- a/deeplake/core/transform/transform.py +++ b/deeplake/core/transform/transform.py @@ -34,6 +34,7 @@ from deeplake.hooks import dataset_written, dataset_read from deeplake.util.version_control import auto_checkout from deeplake.util.class_label import sync_labels +from deeplake.constants import DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE import posixpath @@ -57,7 +58,7 @@ def eval( check_lengths: bool = True, pad_data_in: bool = False, read_only_ok: bool = False, - cache_size: int = 16, + cache_size: int = DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE, checkpoint_interval: int = 0, ignore_errors: bool = False, **kwargs, @@ -135,7 +136,7 @@ def eval( check_lengths: bool = True, pad_data_in: bool = False, read_only_ok: bool = False, - cache_size: int = 16, + cache_size: int = DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE, checkpoint_interval: int = 0, ignore_errors: bool = False, **kwargs, diff --git a/deeplake/util/transform.py b/deeplake/util/transform.py index db4142d846..f69dac1225 100644 --- a/deeplake/util/transform.py +++ b/deeplake/util/transform.py @@ -16,6 +16,7 @@ MB, TRANSFORM_PROGRESSBAR_UPDATE_INTERVAL, TRANSFORM_RECHUNK_AVG_SIZE_BOUND, + TRANSFORM_CHUNK_CACHE_SIZE, ) from deeplake.util.dataset import try_flushing from deeplake.util.remove_cache import ( @@ -339,19 +340,18 @@ def create_worker_chunk_engines( """ all_chunk_engines: Dict[str, ChunkEngine] = {} num_tries = 1000 + storage_cache = LRUCache(MemoryProvider(), output_storage, TRANSFORM_CHUNK_CACHE_SIZE) + storage_cache.autoflush = False + # TODO: replace this with simply a MemoryProvider once we get rid of cachable + memory_cache = LRUCache( + MemoryProvider(), + MemoryProvider(), + 64 * MB, + ) + memory_cache.autoflush = False for tensor in tensors: for i in range(num_tries): try: - # TODO: replace this with simply a MemoryProvider once we get rid of cachable - memory_cache = LRUCache( - MemoryProvider(), - MemoryProvider(), - 64 * MB, - ) - memory_cache.autoflush = False - storage_cache = LRUCache(MemoryProvider(), output_storage, 64 * MB) - storage_cache.autoflush = False - # this chunk engine is used to retrieve actual tensor meta and chunk_size storage_chunk_engine = ChunkEngine(tensor, storage_cache, version_state) existing_meta = storage_chunk_engine.tensor_meta