Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SummarizationContextProcess #316

Merged
merged 1 commit into from
May 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions examples/context_examples/summarization_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import time

from gptcache.manager import get_data_manager, VectorBase
from gptcache import cache, Cache
from gptcache.embedding import Onnx
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
from gptcache.adapter import openai
from gptcache.processor.context.summarization_context import SummarizationContextProcess
from gptcache.utils import import_huggingface
from gptcache.manager import manager_factory
import_huggingface()
import transformers # pylint: disable=C0413

def response_text(openai_resp):
    """Return the message text of the first choice in a chat completion response.

    :param openai_resp: A mapping shaped like
        ``{'choices': [{'message': {'content': ...}}, ...]}``.
    :return: The ``content`` value of the first choice's message.
    """
    first_choice = openai_resp['choices'][0]
    message = first_choice['message']
    return message['content']


def cache_init():
    """Initialise the global ``cache`` for the summarization-context example.

    Builds an ONNX embedding model and a ``facebook/bart-large-cnn``
    summarization pipeline, wraps the pipeline in a
    ``SummarizationContextProcess`` with a target length of 512, and
    registers everything on the global ``cache`` object. Finally sets the
    OpenAI API key from the ``OPENAI_API_KEY`` environment variable.
    """
    onnx = Onnx()
    summarizer = transformers.pipeline("summarization", model="facebook/bart-large-cnn")
    # ``None`` lets SummarizationContextProcess pick its default tokenizer.
    context_process = SummarizationContextProcess(summarizer, None, 512)
    # The vector store must use the same dimension as the embedding model.
    data_manager = manager_factory("sqlite,faiss", vector_params={"dimension": onnx.dimension})
    cache.init(
        pre_embedding_func=context_process.pre_process,
        embedding_func=onnx.to_embeddings,
        data_manager=data_manager,
        similarity_evaluation=SearchDistanceEvaluation(),
    )
    # NOTE(review): this overwrites any OPENAI_API_KEY already present in the
    # environment with a placeholder; replace 'API KEY' with a real key
    # before running the example.
    os.environ['OPENAI_API_KEY'] = 'API KEY'
    cache.set_openai_key()

def base_request():
    """Send the same multi-turn chat request twice, printing latency and reply.

    The cache is initialised once up front; each of the two iterations
    rebuilds the conversation, calls ``openai.ChatCompletion.create`` through
    the GPTCache adapter, and prints the elapsed wall-clock time followed by
    the text of the response.
    """
    cache_init()

    # (role, content) pairs of the conversation, in order.
    turns = (
        ('user', 'Can you give me some tips for staying focused while working from home?'),
        ('system', 'Sure! Here are some tips: create a designated workspace, set a schedule, take breaks, minimize distractions, and practice good time management.'),
        ('user', 'Those are all great suggestions. Do you have any tips for maintaining a healthy work-life balance while working from home?'),
        ('system', 'Definitely! Setting clear boundaries between work and personal time, scheduling regular breaks throughout the day, and finding ways to disconnect from work after hours can help. Additionally, make time for hobbies and other activities you enjoy outside of work to help you relax and recharge.'),
        ('user', 'can you give meore tips?'),
    )

    for _attempt in range(2):
        began = time.time()
        # A fresh message list is built for every call.
        conversation = [{'role': role, 'content': content} for role, content in turns]
        response = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=conversation,
            temperature=0,
        )
        elapsed = time.time() - began
        print('Time consuming: {:.2f}s'.format(elapsed))
        reply = response_text(response)
        print(f'Received: {reply}')

# Run the example only when this file is executed as a script, not on import.
if __name__ == '__main__':
    base_request()
60 changes: 60 additions & 0 deletions gptcache/processor/context/summarization_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Dict, Any
from gptcache.processor import ContextProcess
from gptcache.utils import import_huggingface
import numpy as np
import_huggingface()

import transformers # pylint: disable=C0413

class SummarizationContextProcess(ContextProcess):
    """A context processor that condenses a chat history with a summarizer model.

    Every message receives a share of ``target_length`` proportional to its
    character length. Messages whose token count exceeds their share are
    summarized; the others are kept verbatim. The concatenation of the
    results is used as the embedding content.

    :param summarizer: The summarizer to use, e.g. a ``transformers``
        summarization pipeline. It is called as
        ``summarizer(text, max_length=..., min_length=1, do_sample=False)`` and
        must return a sequence whose first item has a ``'summary_text'`` key.
    :type summarizer: Callable
    :param tokenizer: The tokenizer used to measure text length in tokens.
        When ``None``, the ``roberta-base`` ``RobertaTokenizer`` is loaded.
    :type tokenizer: transformers.PreTrainedTokenizer
    :param target_length: The length budget of the summarized text.
    :type target_length: int
    """

    def __init__(self, summarizer, tokenizer=None, target_length=512):
        self.summarizer = summarizer
        self.target_length = target_length
        if tokenizer is None:
            tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')
        self.tokenizer = tokenizer
        # Start with an empty conversation so process_all_content() can be
        # called safely before format_all_content().
        self.content = []

    def summarize_to_sentence(self, summarizer, sentences, target_size=1000):
        """Shrink ``sentences`` towards ``target_size`` and concatenate them.

        :param summarizer: The summarizer callable (see the class docstring).
        :param sentences: The texts to condense, in order.
        :param target_size: The overall budget, split between the texts in
            proportion to their character lengths.
        :return: The (possibly summarized) texts joined without a separator;
            an empty string when there is no text at all.
        """
        # Materialize once: the input is traversed twice below.
        sentences = list(sentences)
        lengths = [len(sentence) for sentence in sentences]
        total_length = sum(lengths)
        if total_length == 0:
            # No messages, or only empty ones: nothing to summarize and no
            # meaningful way to split the budget (avoids dividing by zero).
            return ''

        pieces = []
        for sent, length in zip(sentences, lengths):
            # Clamp to 1 so a very short message never asks the summarizer
            # for max_length=0, which would contradict min_length=1.
            target_len = max(1, int(target_size * length / total_length))
            if len(self.tokenizer.tokenize(sent)) > target_len:
                response = summarizer(sent, max_length=target_len, min_length=1, do_sample=False)
                pieces.append(response[0]['summary_text'])
            else:
                pieces.append(sent)
        return ''.join(pieces)

    def format_all_content(self, data: Dict[str, Any], **params: Dict[str, Any]):
        """Store a copy of the chat messages found under ``data['messages']``.

        :param data: The request parameters; must contain a ``'messages'``
            entry holding the message dicts.
        :param params: Extra parameters, unused here.
        :raises KeyError: If ``data`` has no ``'messages'`` entry.
        """
        self.content = list(data['messages'])

    def process_all_content(self) -> (Any, Any):
        """Build the content to save and the content to embed.

        :return: A ``(save_content, embedding_content)`` pair.
            ``save_content`` serializes every stored message as
            ``[#RS]role[#RE][#CS]content[#CE]``; ``embedding_content`` is the
            summarized conversation produced by ``summarize_to_sentence``.
        """
        embedding_content = self.summarize_to_sentence(
            self.summarizer,
            [message['content'] for message in self.content],
            self.target_length,
        )
        save_content = ''.join(
            '[#RS]{}[#RE][#CS]{}[#CE]'.format(message['role'], message['content'])
            for message in self.content
        )
        return save_content, embedding_content
24 changes: 24 additions & 0 deletions tests/unit_tests/processor/test_summarize_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from gptcache.processor.context.summarization_context import SummarizationContextProcess
from transformers import pipeline, RobertaTokenizer


def test_summarization_context_process():
    """Check that a long four-message chat is condensed to fewer than 512 roberta-base tokens."""
    # Real summarization pipeline; passing None as the tokenizer makes the
    # processor fall back to its default tokenizer.
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    context_process = SummarizationContextProcess(summarizer, None, 512)
    # Conversation alternating short user questions with long system answers.
    chat = []
    chat.append({"role": "user", "content": "What is the meaning of life? Explain it in 5 paragraphs."})
    chat.append({"role": "system", "content": "The meaning of life is a philosophical question that has been debated throughout history. It is a complex and multifaceted topic, and different people may have different answers depending on their beliefs, values, and experiences. In general, though, the meaning of life is thought to be the reason for which we exist, the purpose that gives our lives significance and direction. \
One possible answer to the question of the meaning of life is that it is to seek happiness and fulfillment. This is a common theme in many philosophical and religious traditions, which often emphasize the importance of living a good and virtuous life in order to achieve happiness and fulfillment. For some, this may mean striving for personal growth and self-improvement, while for others it may involve pursuing spiritual enlightenment or a connection with a higher power. \
Another possible answer to the question of the meaning of life is that it is to contribute to the greater good. This approach emphasizes the importance of serving others and making the world a better place, whether through acts of kindness, charitable work, or other forms of service. For those who believe in this approach, the meaning of life is not just about individual happiness, but also about making a positive impact on the world and the lives of others. \
A third possible answer to the question of the meaning of life is that it is to discover and pursue one's passions and interests. This approach emphasizes the importance of living a life that is authentic and true to oneself, and of pursuing one's passions and interests with enthusiasm and dedication. For those who believe in this approach, the meaning of life is not just about achieving some external goal, but about living a life that is rich, fulfilling, and true to who we are. \
In conclusion, the meaning of life is a deeply personal and subjective question, and different people may have different answers. Some may believe that the meaning of life is to seek happiness and fulfillment, while others may believe that it is to contribute to the greater good or to pursue one's passions and interests. Ultimately, the meaning of life is a mystery that may never be fully understood, but that we can strive to uncover and understand through our experiences, beliefs, and values."})
    chat.append({"role": "user", "content": "Shouldn't the answer be 42?"})
    chat.append({"role": "system", "content": '''The answer "42" is a reference to a famous line in the book "The Hitchhiker's Guide to the Galaxy" by Douglas Adams. In the book, the character Deep Thought is asked to find the answer to the ultimate question of life, the universe, and everything. After thinking for a very long time, Deep Thought reveals that the answer is "42." However, the characters in the book then realize that they don't actually know what the question is, so the answer is ultimately meaningless. \
In other words, the answer "42" is a humorous and satirical take on the idea that there may be a single, definitive answer to the question of the meaning of life. It suggests that the search for such an answer may be futile, and that the meaning of life may be something that is ultimately unknowable. Therefore, while "42" is a famous and memorable line from a popular book, it is not necessarily a serious or meaningful answer to the question of the meaning of life.'''})

    # Load the conversation into the processor, then produce the pair of
    # (content to save, content to embed).
    context_process.format_all_content({'messages':chat})
    save_content, embedding_content = context_process.process_all_content()
    # Measure the embedding content with a roberta-base tokenizer and check
    # it stays under the 512 target passed to the processor above.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    assert len(tokenizer.tokenize(embedding_content)) < 512