Improve the usage example
Signed-off-by: SimFG <bang.fu@zilliz.com>
SimFG committed Mar 31, 2023
1 parent 6df0846 commit 453ac0f
Showing 7 changed files with 63 additions and 39 deletions.
5 changes: 2 additions & 3 deletions .gitignore
@@ -129,9 +129,8 @@ dmypy.json
 .pyre/
 
 .idea
-**/data_map.txt
 **/data_map**.txt
-**/faiss.index
-**/sqlite.db
+**/faiss**.index
+**/sqlite**.db
 **/example.py
 **/example.db
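
The widened patterns rely on gitignore treating a "**" inside a file name like an ordinary "*", so each entry now also ignores suffixed variants of the data files in any directory. Two hypothetical matches (the example paths are illustrative, not from the commit):

    **/faiss**.index   matches  faiss.index  and  example/benchmark/faiss_bak.index
    **/sqlite**.db     matches  sqlite.db    and  example/map/sqlite_v2.db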
23 changes: 14 additions & 9 deletions example/benchmark/benchmark_sf_towhee.py
@@ -1,4 +1,5 @@
 import json
+import os
 import time
 
 from gpt_cache.view import openai
@@ -23,6 +24,10 @@ def sf_evaluation(src_dict, cache_dict, **kwargs):
         return rank2 if rank2 != 0 else 1
     return 0
 
+sqlite_file = "sqlite.db"
+faiss_file = "faiss.index"
+has_data = os.path.isfile(sqlite_file) and os.path.isfile(faiss_file)
+
 data_manager = get_si_data_manager("sqlite", "faiss", dimension=embedding_towhee.dimension(), max_size=100000)
 cache.init(embedding_func=embedding_towhee.to_embeddings,
            data_manager=data_manager,
@@ -36,15 +41,15 @@ def sf_evaluation(src_dict, cache_dict, **kwargs):
pair["id"] = str(i)
i += 1

# you should CLOSE it if you SECONDLY run it
print("insert data")
id_origin = {}
for pair in mock_data:
question = pair["origin"]
answer = pair["id"]
id_origin[answer] = question
cache.data_manager.save(question, answer, cache.embedding_func(question))
print("end insert data")
if not has_data:
print("insert data")
id_origin = {}
for pair in mock_data:
question = pair["origin"]
answer = pair["id"]
id_origin[answer] = question
cache.data_manager.save(question, answer, cache.embedding_func(question))
print("end insert data")

all_time = 0.0
hit_cache_positive, hit_cache_negative = 0, 0
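
The change repeated across these example scripts is a first-run guard: instead of asking the user to comment out the seeding block before a second run (the old "you should CLOSE it if you SECONDLY run it" comment), each script now checks whether the data files from an earlier run exist and only inserts the mock data when they do not. A minimal standalone sketch of the pattern, with the seeding step reduced to a print (assumption: the cache counts as warm only when both files exist):

    import os

    sqlite_file = "sqlite.db"
    faiss_file = "faiss.index"
    has_data = os.path.isfile(sqlite_file) and os.path.isfile(faiss_file)

    if not has_data:
        print("insert data")  # first run: seed the cache
    else:
        print("found existing cache files, skipping insert")  # later runs reuse them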
25 changes: 14 additions & 11 deletions example/map/map_manager.py
@@ -8,27 +8,30 @@
 def run():
     dirname, _ = os.path.split(os.path.abspath(__file__))
     bak_cache = Cache()
+    bak_data_file = dirname + "/data_map_bak.txt"
     bak_cache.init(data_manager=get_data_manager("map",
-                                                 data_path=dirname + "/data_map_bak.txt",
+                                                 data_path=bak_data_file,
                                                  max_size=10))
+    data_file = dirname + "/data_map.txt"
     cache.init(data_manager=get_data_manager("map",
-                                             data_path=dirname + "/data_map.txt",
+                                             data_path=data_file,
                                              max_size=10),
                next_cache=bak_cache)
     mock_messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "foo15"}
     ]
 
-    # you should CLOSE it if you SECONDLY run it
-    for i in range(10):
-        question = f"foo{i}"
-        answer = f"receiver the foo {i}"
-        cache.data_manager.save(question, answer, cache.embedding_func(question))
-    for i in range(10, 20):
-        question = f"foo{i}"
-        answer = f"receiver the foo {i}"
-        bak_cache.data_manager.save(question, answer, bak_cache.embedding_func(question))
+    if not os.path.isfile(bak_data_file):
+        for i in range(10):
+            question = f"foo{i}"
+            answer = f"receiver the foo {i}"
+            cache.data_manager.save(question, answer, cache.embedding_func(question))
+    if not os.path.isfile(data_file):
+        for i in range(10, 20):
+            question = f"foo{i}"
+            answer = f"receiver the foo {i}"
+            bak_cache.data_manager.save(question, answer, bak_cache.embedding_func(question))
 
     answer = openai.ChatCompletion.create(
         model="gpt-3.5-turbo",
18 changes: 12 additions & 6 deletions example/sf_mock/sf_manager.py
@@ -1,3 +1,5 @@
+import os
+
 from gpt_cache.view import openai
 from gpt_cache.core import cache, Config
 from gpt_cache.cache.factory import get_si_data_manager
@@ -13,7 +15,11 @@ def mock_embeddings(data, **kwargs):


 def run():
-    data_manager = get_si_data_manager("sqlite", "faiss", dimension=d, max_size=8, clean_size=2, top_k=3)
+    sqlite_file = "sqlite.db"
+    faiss_file = "faiss.index"
+    has_data = os.path.isfile(sqlite_file) and os.path.isfile(faiss_file)
+    data_manager = get_si_data_manager("sqlite", "faiss",
+                                       dimension=d, max_size=8, clean_size=2, top_k=3)
     cache.init(embedding_func=mock_embeddings,
                data_manager=data_manager,
                evaluation_func=pair_evaluation,
@@ -26,11 +32,11 @@ def run():
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "foo"}
]
# you should CLOSE it if you SECONDLY run it
for i in range(10):
question = f"foo{i}"
answer = f"receiver the foo {i}"
cache.data_manager.save(question, answer, cache.embedding_func(question))
if not has_data:
for i in range(10):
question = f"foo{i}"
answer = f"receiver the foo {i}"
cache.data_manager.save(question, answer, cache.embedding_func(question))

answer = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
15 changes: 10 additions & 5 deletions example/sf_towhee/sf_manager.py
@@ -1,3 +1,4 @@
+import os
 import time
 
 from gpt_cache.view import openai
@@ -9,17 +10,21 @@

 def run():
     towhee = Towhee()
-    data_manager = get_si_data_manager("sqlite", "faiss", dimension=towhee.dimension(), max_size=2000)
+    sqlite_file = "sqlite.db"
+    faiss_file = "faiss.index"
+    has_data = os.path.isfile(sqlite_file) and os.path.isfile(faiss_file)
+    data_manager = get_si_data_manager("sqlite", "faiss",
+                                       dimension=towhee.dimension(), max_size=2000)
     cache.init(embedding_func=towhee.to_embeddings,
                data_manager=data_manager,
                evaluation_func=pair_evaluation,
                similarity_threshold=10000,
                similarity_positive=False)
 
-    # you should CLOSE it if you SECONDLY run it
-    question = "what do you think about chatgpt"
-    answer = "chatgpt is a good application"
-    cache.data_manager.save(question, answer, cache.embedding_func(question))
+    if not has_data:
+        question = "what do you think about chatgpt"
+        answer = "chatgpt is a good application"
+        cache.data_manager.save(question, answer, cache.embedding_func(question))
 
     # distance 77
     mock_messages = [
14 changes: 9 additions & 5 deletions example/sqlite_milvus_mock/sqlite_milvus_mock.py
@@ -1,3 +1,5 @@
+import os
+
 from gpt_cache.view import openai
 from gpt_cache.core import cache, Config
 from gpt_cache.cache.factory import get_ss_data_manager
@@ -13,6 +15,8 @@ def mock_embeddings(data, **kwargs):


 def run():
+    sqlite_file = "sqlite.db"
+    has_data = os.path.isfile(sqlite_file)
     # milvus
     data_manager = get_ss_data_manager("sqlite", "milvus", dimension=d, max_size=8, clean_size=2)
     # zilliz cloud
@@ -33,11 +37,11 @@ def run():
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "foo"}
]
# you should CLOSE it if you SECONDLY run it
for i in range(10):
question = f"foo{i}"
answer = f"receiver the foo {i}"
cache.data_manager.save(question, answer, cache.embedding_func(question))
if not has_data:
for i in range(10):
question = f"foo{i}"
answer = f"receiver the foo {i}"
cache.data_manager.save(question, answer, cache.embedding_func(question))

answer = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
2 changes: 2 additions & 0 deletions gpt_cache/cache/vector_data/faiss.py
@@ -30,6 +30,8 @@ def mult_add(self, datas):
         self.index.add(np_data)
 
     def search(self, data):
+        if self.index.ntotal == 0:
+            return None
         np_data = np.array(data).astype('float32').reshape(1, -1)
         D, I = self.index.search(np_data, self.top_k)
         distances = []
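
The new guard matters because FAISS does not raise when an empty index is searched: it returns placeholder ids of -1 (with sentinel distances), which every caller would otherwise need to special-case. A small sketch of that behavior, assuming faiss and numpy are installed; the variable names are illustrative, not from the repository:

    import faiss
    import numpy as np

    dim, top_k = 8, 3
    index = faiss.IndexFlatL2(dim)  # freshly created, so index.ntotal == 0
    query = np.random.rand(1, dim).astype('float32')

    D, I = index.search(query, top_k)
    print(I)  # [[-1 -1 -1]] -- placeholder ids, not real neighbors

    # The guard added in this commit turns that case into an explicit miss:
    if index.ntotal == 0:
        result = None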
