Support paddlenlp embedding

Signed-off-by: vax521 <13263397018@163.com> Signed-off-by: feimeng <13263397018@163.com>
zilliztech · May 22, 2023 · dff1c77 · dff1c77
1 parent a2b7466
commit dff1c77
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 2 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -136,6 +136,20 @@ fast_text = FastText()
 
 </details>
 
+<details>
+
+<summary> PaddleNLP </summary>
+
+```python
+from gptcache.embedding import PaddleNLP
+
+paddlenlp = PaddleNLP()
+# paddlenlp.dimension
+# paddlenlp.to_embeddings
+```
+
+</details>
+
 ### Custom embedding
 
 The function has two parameters: the preprocessed string and parameters reserved for user customization. To acquire these parameters, a similar method to the one above is used: `kwargs.get("embedding_func", {})`.

diff --git a/examples/embedding/paddlenlp.py b/examples/embedding/paddlenlp.py
@@ -0,0 +1,32 @@
+from gptcache.adapter import openai
+from gptcache import cache
+from gptcache.manager.factory import get_data_manager
+from gptcache.manager import get_data_manager, CacheBase, VectorBase
+from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
+from gptcache.embedding import PaddleNLP
+
+
+def run():
+    """Send one chat request through a cache that uses PaddleNLP embeddings.
+
+    Builds a data manager from a ``sqlite`` cache base and a ``faiss`` vector
+    base sized to the encoder's dimension, initialises the global ``cache``
+    with it, then issues a single ``gpt-3.5-turbo`` chat completion via the
+    gptcache ``openai`` adapter and prints the result.
+    """
+    encoder = PaddleNLP()
+
+    storage = get_data_manager(
+        CacheBase('sqlite'),
+        VectorBase('faiss', dimension=encoder.dimension),
+    )
+    cache.init(
+        embedding_func=encoder.to_embeddings,
+        data_manager=storage,
+        similarity_evaluation=SearchDistanceEvaluation(),
+    )
+    cache.set_openai_key()
+
+    question = {'role': 'user', 'content': 'what is chatgpt'}
+    response = openai.ChatCompletion.create(
+        model='gpt-3.5-turbo',
+        messages=[question],
+    )
+    print(response)
+
+
+# Run the demo only when this file is executed directly as a script.
+if __name__ == '__main__':
+    run()
diff --git a/gptcache/embedding/__init__.py b/gptcache/embedding/__init__.py
@@ -10,6 +10,7 @@
     "ViT",
     "LangChain",
     "Rwkv",
+    "PaddleNLP",
 ]
 
 
@@ -26,6 +27,7 @@
 vit = LazyImport("vit", globals(), "gptcache.embedding.vit")
 langchain = LazyImport("langchain", globals(), "gptcache.embedding.langchain")
 rwkv = LazyImport("rwkv", globals(), "gptcache.embedding.rwkv")
+paddlenlp = LazyImport("paddlenlp", globals(), "gptcache.embedding.paddlenlp")
 
 
 def Cohere(model="large", api_key=None):
@@ -70,3 +72,7 @@ def LangChain(embeddings, dimension=0):
 
 def Rwkv(model="sgugger/rwkv-430M-pile"):
     return rwkv.Rwkv(model)
+
+
+def PaddleNLP(model="ernie-3.0-medium-zh"):
+    """Build a PaddleNLP embedding encoder through the lazily imported ``paddlenlp`` module.
+
+    :param model: model name forwarded to ``paddlenlp.PaddleNLP``,
+        defaults to 'ernie-3.0-medium-zh'.
+    :return: the object returned by ``paddlenlp.PaddleNLP(model)``.
+    """
+    return paddlenlp.PaddleNLP(model)
diff --git a/gptcache/embedding/paddlenlp.py b/gptcache/embedding/paddlenlp.py
@@ -0,0 +1,76 @@
+import numpy as np
+
+from gptcache.utils import import_paddlenlp,import_paddle
+from gptcache.embedding.base import BaseEmbedding
+
+import_paddle()
+import_paddlenlp()
+
+
+import paddle # pylint: disable=C0413
+from paddlenlp.transformers import AutoModel,AutoTokenizer # pylint: disable=C0413
+
+class PaddleNLP(BaseEmbedding):
+    """Generate sentence embedding for given text using pretrained models from PaddleNLP transformers.
+
+    :param model: model name, defaults to 'ernie-3.0-medium-zh'.
+    :type model: str
+
+    Example:
+        .. code-block:: python
+
+            from gptcache.embedding import PaddleNLP
+
+            test_sentence = 'Hello, world.'
+            encoder = PaddleNLP(model='ernie-3.0-medium-zh')
+            embed = encoder.to_embeddings(test_sentence)
+    """
+
+    def __init__(self, model: str = "ernie-3.0-medium-zh"):
+        self.model = AutoModel.from_pretrained(model)
+        # Inference only: switch the network to evaluation mode.
+        self.model.eval()
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        # Batch padding needs a pad token; fall back to "<pad>" if none is set.
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = "<pad>"
+        # Filled lazily by the ``dimension`` property.
+        self.__dimension = None
+
+    def to_embeddings(self, data, **_):
+        """Generate embedding given text input
+
+        :param data: text in string. A list of strings is tokenized as one
+            padded batch.
+        :type data: str
+
+        :return: a text embedding in shape of (dim,) for a single string.
+        """
+        if not isinstance(data, list):
+            data = [data]
+        inputs = self.tokenizer(
+            data, padding=True, truncation=True, return_tensors="pd"
+        )
+        # No gradients are needed for embedding extraction, so skip building
+        # the autograd graph.
+        with paddle.no_grad():
+            outs = self.model(**inputs)[0]
+            pooled = self.post_proc(outs, inputs)
+        emb = pooled.squeeze(0).detach().numpy()
+        return np.array(emb).astype("float32")
+
+    def post_proc(self, token_embeddings, inputs):
+        """Mean-pool token embeddings into one vector per input sequence.
+
+        For a batch of several sequences, positions holding the tokenizer's
+        pad id are excluded from the average so that padding added to shorter
+        sequences does not distort their embeddings. A single sequence is
+        never padded, so every position is kept.
+
+        :param token_embeddings: first output of the model for ``inputs``.
+        :param inputs: tokenizer output; ``inputs["input_ids"]`` is read.
+
+        :return: the masked mean of ``token_embeddings`` over axis 1.
+        """
+        input_ids = inputs["input_ids"]
+        pad_id = self.tokenizer.pad_token_id
+        if pad_id is None or input_ids.shape[0] == 1:
+            # Nothing can be padding here: keep every token.
+            attention_mask = paddle.ones(input_ids.shape)
+        else:
+            # 1.0 for real tokens, 0.0 for padding positions.
+            attention_mask = 1.0 - (input_ids == pad_id).astype("float32")
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(token_embeddings.shape).astype("float32")
+        )
+        # The clip guards against division by zero for an all-padding row.
+        sentence_embs = paddle.sum(
+            token_embeddings * input_mask_expanded, 1
+        ) / paddle.clip(input_mask_expanded.sum(1), min=1e-9)
+        return sentence_embs
+
+    @property
+    def dimension(self):
+        """Embedding dimension.
+
+        Computed once by embedding a probe string, then cached.
+
+        :return: embedding dimension
+        """
+        if self.__dimension is None:
+            self.__dimension = len(self.to_embeddings("foo"))
+        return self.__dimension
+
diff --git a/gptcache/utils/__init__.py b/gptcache/utils/__init__.py
@@ -31,9 +31,10 @@
     "import_httpx",
     "import_openai",
     "import_docarray",
-
     "softmax",
-]
+    "import_paddle",
+    "import_paddlenlp"
+    ]
 
 import importlib.util
 from typing import Optional
@@ -210,3 +211,11 @@ def import_openai():
 
 def import_docarray():
     _check_library("docarray")
+
+
+def import_paddle():
+    """Check that PaddlePaddle is importable, requesting ``paddlepaddle==2.4.0`` if not.
+
+    The PyPI distribution is named ``paddlepaddle`` while the module that
+    code imports is ``paddle``, so the availability check is given the module
+    name and the pinned distribution is passed separately as ``package``.
+    NOTE(review): assumes ``_check_library`` looks up its first argument as an
+    importable module and uses ``package`` as the install target - confirm
+    against its definition.
+    """
+    _check_library("paddle", package="paddlepaddle==2.4.0")
+
+
+def import_paddlenlp():
+    """Run the shared library check for the ``paddlenlp`` package."""
+    _check_library("paddlenlp")
diff --git a/tests/unit_tests/embedding/test_paddlenlp.py b/tests/unit_tests/embedding/test_paddlenlp.py
@@ -0,0 +1,14 @@
+from gptcache.embedding import PaddleNLP
+from gptcache.adapter.api import _get_model
+
+
+def test_paddlenlp():
+    """Embedding length must equal ``dimension`` for both construction paths."""
+    model_name = "ernie-3.0-nano-zh"
+    # Builders are lazy so the second encoder is created only after the first
+    # one has been checked.
+    builders = (
+        lambda: PaddleNLP(model_name),
+        lambda: _get_model(model_src="paddlenlp", model_config={"model": model_name}),
+    )
+    for build in builders:
+        encoder = build()
+        expected = encoder.dimension
+        vector = encoder.to_embeddings("中国")
+        assert len(vector) == expected, f"{len(vector)}, {encoder.dimension}"