Skip to content

Commit

Permalink
Support paddlenlp embedding
Browse files Browse the repository at this point in the history
Signed-off-by: vax521 <13263397018@163.com>
Signed-off-by: feimeng <13263397018@163.com>
  • Loading branch information
vax521 authored and SimFG committed May 22, 2023
1 parent a2b7466 commit dff1c77
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 2 deletions.
14 changes: 14 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,20 @@ fast_text = FastText()

</details>

<details>

<summary> PaddleNLP </summary>

```python
from gptcache.embedding import PaddleNLP

paddlenlp = PaddleNLP()
# paddlenlp.dimension
# paddlenlp.to_embeddings
```

</details>

### Custom embedding

The function has two parameters: the preprocessed string and parameters reserved for user customization. To acquire these parameters, a similar method to the one above is used: `kwargs.get("embedding_func", {})`.
Expand Down
32 changes: 32 additions & 0 deletions examples/embedding/paddlenlp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from gptcache.adapter import openai
from gptcache import cache
from gptcache.manager.factory import get_data_manager
from gptcache.manager import get_data_manager, CacheBase, VectorBase
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
from gptcache.embedding import PaddleNLP


def run():
    """Send one chat question through a GPTCache cache that embeds with PaddleNLP.

    The cache is backed by a 'sqlite' ``CacheBase`` and a 'faiss' ``VectorBase``
    whose dimension is taken from the PaddleNLP encoder, and uses
    ``SearchDistanceEvaluation`` to score similarity. The response is printed.
    """
    encoder = PaddleNLP()

    # The vector store has to be created with the encoder's embedding size.
    data_manager = get_data_manager(
        CacheBase('sqlite'),
        VectorBase('faiss', dimension=encoder.dimension),
    )

    cache.init(
        embedding_func=encoder.to_embeddings,
        data_manager=data_manager,
        similarity_evaluation=SearchDistanceEvaluation(),
    )
    cache.set_openai_key()

    question = [{'role': 'user', 'content': 'what is chatgpt'}]
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=question,
    )
    print(response)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    run()
6 changes: 6 additions & 0 deletions gptcache/embedding/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"ViT",
"LangChain",
"Rwkv",
"PaddleNLP",
]


Expand All @@ -26,6 +27,7 @@
vit = LazyImport("vit", globals(), "gptcache.embedding.vit")
langchain = LazyImport("langchain", globals(), "gptcache.embedding.langchain")
rwkv = LazyImport("rwkv", globals(), "gptcache.embedding.rwkv")
paddlenlp = LazyImport("paddlenlp", globals(), "gptcache.embedding.paddlenlp")


def Cohere(model="large", api_key=None):
Expand Down Expand Up @@ -70,3 +72,7 @@ def LangChain(embeddings, dimension=0):

def Rwkv(model="sgugger/rwkv-430M-pile"):
    """Create the RWKV embedding object for ``model``.

    The implementation lives in ``gptcache.embedding.rwkv``, which is referenced
    through the module-level ``LazyImport`` handle ``rwkv``.

    :param model: model name, defaults to 'sgugger/rwkv-430M-pile'.
    :return: the ``rwkv.Rwkv`` instance built for ``model``.
    """
    encoder = rwkv.Rwkv(model)
    return encoder


def PaddleNLP(model="ernie-3.0-medium-zh"):
    """Create the PaddleNLP embedding object for ``model``.

    The implementation lives in ``gptcache.embedding.paddlenlp``, which is
    referenced through the module-level ``LazyImport`` handle ``paddlenlp``.

    :param model: model name, defaults to 'ernie-3.0-medium-zh'.
    :return: the ``paddlenlp.PaddleNLP`` instance built for ``model``.
    """
    encoder = paddlenlp.PaddleNLP(model)
    return encoder
76 changes: 76 additions & 0 deletions gptcache/embedding/paddlenlp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import numpy as np

from gptcache.utils import import_paddlenlp,import_paddle
from gptcache.embedding.base import BaseEmbedding

import_paddle()
import_paddlenlp()


import paddle # pylint: disable=C0413
from paddlenlp.transformers import AutoModel,AutoTokenizer # pylint: disable=C0413

class PaddleNLP(BaseEmbedding):
    """Generate sentence embeddings for text using pretrained PaddleNLP transformer models.

    :param model: model name, defaults to 'ernie-3.0-medium-zh'.
    :type model: str

    Example:
        .. code-block:: python

            from gptcache.embedding import PaddleNLP

            test_sentence = 'Hello, world.'
            encoder = PaddleNLP(model='ernie-3.0-medium-zh')
            embed = encoder.to_embeddings(test_sentence)
    """

    def __init__(self, model: str = "ernie-3.0-medium-zh"):
        """Load the pretrained model and tokenizer identified by ``model``.

        :param model: model name passed to ``AutoModel`` / ``AutoTokenizer``.
        :type model: str
        """
        self.model = AutoModel.from_pretrained(model)
        # The model is only used for inference, so put it in evaluation mode.
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        # Batches are tokenized with padding=True, which needs a pad token.
        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = "<pad>"
        # Computed lazily by the ``dimension`` property.
        self.__dimension = None

    def to_embeddings(self, data, **_):
        """Generate an embedding for the given text input.

        :param data: text in string (a list of strings is tokenized as one batch).
        :type data: str
        :return: a float32 text embedding; for a single string its shape is (dim,).
        """
        if not isinstance(data, list):
            data = [data]
        inputs = self.tokenizer(
            data, padding=True, truncation=True, return_tensors="pd"
        )
        # Pure inference: skip autograd bookkeeping for the forward pass.
        with paddle.no_grad():
            outs = self.model(**inputs)[0]
            emb = self.post_proc(outs, inputs).squeeze(0).detach().numpy()
        return np.array(emb).astype("float32")

    def post_proc(self, token_embeddings, inputs):
        """Mean-pool token embeddings over axis 1 into sentence embeddings.

        :param token_embeddings: first output of the model for ``inputs``.
        :param inputs: tokenizer output that produced ``token_embeddings``.
        :return: the mask-weighted sum over axis 1 divided by the mask sum.
        """
        if "attention_mask" in inputs:
            # Honour the tokenizer's mask so padded positions are not averaged in.
            attention_mask = inputs["attention_mask"]
        else:
            # No mask supplied: weight every position equally. ``input_ids`` is
            # used for the shape because not every tokenizer emits
            # ``token_type_ids``.
            attention_mask = paddle.ones(inputs["input_ids"].shape)
        input_mask_expanded = (
            attention_mask.astype("float32")
            .unsqueeze(-1)
            .expand(token_embeddings.shape)
        )
        summed = paddle.sum(token_embeddings * input_mask_expanded, 1)
        # Clip the denominator so an all-zero mask cannot divide by zero.
        counts = paddle.clip(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    @property
    def dimension(self):
        """Embedding dimension.

        Computed on first access by embedding a probe string, then cached.

        :return: embedding dimension
        """
        if self.__dimension is None:
            self.__dimension = len(self.to_embeddings("foo"))
        return self.__dimension

13 changes: 11 additions & 2 deletions gptcache/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
"import_httpx",
"import_openai",
"import_docarray",

"softmax",
]
"import_paddle",
"import_paddlenlp"
]

import importlib.util
from typing import Optional
Expand Down Expand Up @@ -210,3 +211,11 @@ def import_openai():

def import_docarray():
    """Run the shared ``_check_library`` dependency check for ``docarray``.

    NOTE(review): ``_check_library`` is defined outside this view; presumably it
    verifies that the library is importable and offers to install it when it is
    missing. Confirm against its definition.
    """
    _check_library("docarray")


def import_paddle():
    """Run the shared ``_check_library`` dependency check for PaddlePaddle.

    The pip distribution is called ``paddlepaddle`` but the module it installs
    is imported as ``paddle``, so the check uses the import name and passes the
    pinned distribution separately as the package to install. Checking for a
    module named ``paddlepaddle`` could never succeed.

    NOTE(review): assumes ``_check_library`` (defined outside this view) treats
    its first argument as an importable module name and ``package`` as the pip
    requirement. Confirm against its definition.
    """
    _check_library("paddle", package="paddlepaddle==2.4.0")


def import_paddlenlp():
    """Run the shared ``_check_library`` dependency check for ``paddlenlp``.

    NOTE(review): ``_check_library`` is defined outside this view; presumably it
    verifies that the library is importable and offers to install it when it is
    missing. Confirm against its definition.
    """
    _check_library("paddlenlp")
14 changes: 14 additions & 0 deletions tests/unit_tests/embedding/test_paddlenlp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from gptcache.embedding import PaddleNLP
from gptcache.adapter.api import _get_model


def test_paddlenlp():
    """An embedding's length must equal the encoder's reported dimension.

    Checked for an encoder built directly and for one built via ``_get_model``.
    """

    def check_length_matches_dimension(encoder):
        # Read the dimension first, then embed, then compare the two.
        expected = encoder.dimension
        vector = encoder.to_embeddings("中国")
        assert len(vector) == expected, f"{len(vector)}, {encoder.dimension}"

    check_length_matches_dimension(PaddleNLP("ernie-3.0-nano-zh"))
    check_length_matches_dimension(
        _get_model(model_src="paddlenlp", model_config={"model": "ernie-3.0-nano-zh"})
    )

0 comments on commit dff1c77

Please sign in to comment.