From e3598765f44d345d2eb7a503ce69aefb9db49e59 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Tue, 19 Dec 2023 10:26:44 +0000
Subject: [PATCH 1/7] Update the cogvlm demo to support plain-text conversation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 basic_demo/cli_demo_hf_cogvlm_chat.py | 102 ++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 basic_demo/cli_demo_hf_cogvlm_chat.py

diff --git a/basic_demo/cli_demo_hf_cogvlm_chat.py b/basic_demo/cli_demo_hf_cogvlm_chat.py
new file mode 100644
index 00000000..349d07fd
--- /dev/null
+++ b/basic_demo/cli_demo_hf_cogvlm_chat.py
@@ -0,0 +1,102 @@
+"""
+This is a demo for using CogAgent and CogVLM in CLI.
+Make sure you have downloaded the vicuna-7b-v1.5 tokenizer (https://huggingface.co/lmsys/vicuna-7b-v1.5); the full checkpoint of the vicuna-7b-v1.5 LLM is not required.
+In this demo, we use the 'chat' template; you can replace it with others such as 'vqa'.
+We strongly suggest using a GPU with bfloat16 support; otherwise, inference will be slow.
+Note that only one picture can be processed per conversation, which means you cannot replace or insert another picture during the conversation.
+"""
+
+import argparse
+import torch
+
+from PIL import Image
+from transformers import AutoModelForCausalLM, LlamaTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--quant", choices=[4], type=int, default=4, help='quantization bits')
+parser.add_argument("--from_pretrained", type=str, default="THUDM/cogvlm-chat-hf", help='pretrained ckpt')
+parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
+parser.add_argument("--fp16", action="store_true")
+parser.add_argument("--bf16", action="store_true")
+
+args = parser.parse_args()
+MODEL_PATH = args.from_pretrained
+TOKENIZER_PATH = args.local_tokenizer
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH)
+if args.bf16:
+    torch_type = torch.bfloat16
+else:
+    torch_type = torch.float16
+
+print("========Use torch type as:{} with device:{}========\n".format(torch_type, DEVICE))
+
+if args.quant:
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=torch_type,
+        low_cpu_mem_usage=True,
+        load_in_4bit=True,
+        trust_remote_code=True
+    ).eval()
+else:
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=torch_type,
+        low_cpu_mem_usage=True,
+        load_in_4bit=args.quant is not None,
+        trust_remote_code=True
+    ).to(DEVICE).eval()
+
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
+    The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+
+while True:
+    image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
+    if image_or_text == 'image':
+        image_path = input("image path >>>>> ")
+        if image_path == "stop":
+            break
+        image = Image.open(image_path).convert('RGB')
+    else:
+        image = None
+        first_query = True
+    history = []
+
+    while True:
+        query = input("Human:")
+        if query == "clear":
+            break
+
+        if image is None:
+            if first_query:
+                query = text_only_template.format([query])
+                first_query = False
+            else:
+                query = "USER: {} ASSISTANT:".format([query])
+
+        if image is None:
+            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
+        else:
+            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
+
+        inputs = {
+            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
+            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
+            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
+            'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None,
+        }
+        if 'cross_images' in input_by_model and input_by_model['cross_images']:
+            inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
+
+        # add any transformers params here.
+        gen_kwargs = {"max_length": 2048,
+                      "do_sample": False}
+        with torch.no_grad():
+            outputs = model.generate(**inputs, **gen_kwargs)
+            outputs = outputs[:, inputs['input_ids'].shape[1]:]
+            response = tokenizer.decode(outputs[0])
+            response = response.split("</s>")[0]
+            print("\nCog:", response)
+        history.append((query, response))
\ No newline at end of file

From 4130e10410c3c48afc395081cf52123e7b993b04 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Tue, 19 Dec 2023 12:12:17 +0000
Subject: [PATCH 2/7] update cli_demo_hf

---
 basic_demo/cli_demo_hf.py             |  39 +++++++---
 basic_demo/cli_demo_hf_cogvlm_chat.py | 102 --------------------------
 2 files changed, 30 insertions(+), 111 deletions(-)
 delete mode 100644 basic_demo/cli_demo_hf_cogvlm_chat.py

diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 76bd9444..65959cbd 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -14,7 +14,7 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--quant", choices=[4], type=int, default=None, help='quantization bits')
-parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
+parser.add_argument("--from_pretrained", choices=["THUDM/cogagent-chat-hf", "THUDM/cogvlm-chat-hf"], type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
 parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
 parser.add_argument("--fp16", action="store_true")
 parser.add_argument("--bf16", action="store_true")
@@ -49,31 +49,52 @@
         trust_remote_code=True
     ).to(DEVICE).eval()
 
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
+    The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+
 while True:
-    image_path = input("image path >>>>> ")
-    if image_path == "stop":
-        break
+    image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
 
-    image = Image.open(image_path).convert('RGB')
+    if image_or_text == 'image':
+        image_path = input("image path >>>>> ")
+        image = Image.open(image_path).convert('RGB')
+    elif image_or_text == 'text':
+        image = None
+        first_query = True
+    else:
+        print("Please choose one from 'image' and 'text'!")
+        break
+
     history = []
 
     while True:
         query = input("Human:")
         if query == "clear":
             break
-        input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
+        if image is None:
+            if first_query:
+                query = text_only_template.format([query])
+                first_query = False
+            else:
+                query = "USER: {} ASSISTANT:".format([query])
+
+        if image is None:
+            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
+        else:
+            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
+
         inputs = {
             'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
             'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
             'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
-            'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]],
+            'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None,
         }
         if 'cross_images' in input_by_model and input_by_model['cross_images']:
             inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
 
         # add any transformers params here.
         gen_kwargs = {"max_length": 2048,
-                      "temperature": 0.9,
-                      "do_sample": False}
+                      "do_sample": False}  # "temperature": 0.9
         with torch.no_grad():
             outputs = model.generate(**inputs, **gen_kwargs)
             outputs = outputs[:, inputs['input_ids'].shape[1]:]
diff --git a/basic_demo/cli_demo_hf_cogvlm_chat.py b/basic_demo/cli_demo_hf_cogvlm_chat.py
deleted file mode 100644
index 349d07fd..00000000
--- a/basic_demo/cli_demo_hf_cogvlm_chat.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-This is a demo for using CogAgent and CogVLM in CLI.
-Make sure you have downloaded the vicuna-7b-v1.5 tokenizer (https://huggingface.co/lmsys/vicuna-7b-v1.5); the full checkpoint of the vicuna-7b-v1.5 LLM is not required.
-In this demo, we use the 'chat' template; you can replace it with others such as 'vqa'.
-We strongly suggest using a GPU with bfloat16 support; otherwise, inference will be slow.
-Note that only one picture can be processed per conversation, which means you cannot replace or insert another picture during the conversation.
-""" - -import argparse -import torch - -from PIL import Image -from transformers import AutoModelForCausalLM, LlamaTokenizer - -parser = argparse.ArgumentParser() -parser.add_argument("--quant", choices=[4], type=int, default=4, help='quantization bits') -parser.add_argument("--from_pretrained", type=str, default="THUDM/cogvlm-chat-hf", help='pretrained ckpt') -parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path') -parser.add_argument("--fp16", action="store_true") -parser.add_argument("--bf16", action="store_true") - -args = parser.parse_args() -MODEL_PATH = args.from_pretrained -TOKENIZER_PATH = args.local_tokenizer -DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' - -tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH) -if args.bf16: - torch_type = torch.bfloat16 -else: - torch_type = torch.float16 - -print("========Use torch type as:{} with device:{}========\n".format(torch_type, DEVICE)) - -if args.quant: - model = AutoModelForCausalLM.from_pretrained( - MODEL_PATH, - torch_dtype=torch_type, - low_cpu_mem_usage=True, - load_in_4bit=True, - trust_remote_code=True - ).eval() -else: - model = AutoModelForCausalLM.from_pretrained( - MODEL_PATH, - torch_dtype=torch_type, - low_cpu_mem_usage=True, - load_in_4bit=args.quant is not None, - trust_remote_code=True - ).to(DEVICE).eval() - -text_only_template = "A chat between a curious user and an artificial intelligence assistant. \ - The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:" - -while True: - image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ") - if image_or_text == 'image': - image_path = input("image path >>>>> ") - if image_path == "stop": - break - image = Image.open(image_path).convert('RGB') - else: - image = None - first_query = True - history = [] - - while True: - query = input("Human:") - if query == "clear": - break - - if image is None: - if first_query: - query = text_only_template.format([query]) - first_query = False - else: - query = "USER: {} ASSISTANT:".format([query]) - - if image is None: - input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history) - else: - input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image]) - - inputs = { - 'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE), - 'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE), - 'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE), - 'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None, - } - if 'cross_images' in input_by_model and input_by_model['cross_images']: - inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]] - - # add any transformers params here. 
-        gen_kwargs = {"max_length": 2048,
-                      "do_sample": False}
-        with torch.no_grad():
-            outputs = model.generate(**inputs, **gen_kwargs)
-            outputs = outputs[:, inputs['input_ids'].shape[1]:]
-            response = tokenizer.decode(outputs[0])
-            response = response.split("</s>")[0]
-            print("\nCog:", response)
-        history.append((query, response))
\ No newline at end of file

From 2493b815370b8d76f3723d96bbf95b1ce17bc0a9 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Tue, 19 Dec 2023 12:34:10 +0000
Subject: [PATCH 3/7] update basic hf demo

---
 basic_demo/cli_demo_hf.py | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 65959cbd..96e513ad 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -14,7 +14,7 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--quant", choices=[4], type=int, default=None, help='quantization bits')
-parser.add_argument("--from_pretrained", choices=["THUDM/cogagent-chat-hf", "THUDM/cogvlm-chat-hf"], type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
+parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
 parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
 parser.add_argument("--fp16", action="store_true")
 parser.add_argument("--bf16", action="store_true")
@@ -53,17 +53,13 @@
     The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
 
 while True:
-    image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
-
-    if image_or_text == 'image':
-        image_path = input("image path >>>>> ")
-        image = Image.open(image_path).convert('RGB')
-    elif image_or_text == 'text':
+    image_path = input("image path >>>>> ")
+    if image_path == '':
+        print('You did not enter image path, the following will be a plain text conversation.')
         image = None
-        first_query = True
+        first_query = True
     else:
-        print("Please choose one from 'image' and 'text'!")
-        break
+        image = Image.open(image_path).convert('RGB')
 
     history = []
 
@@ -78,11 +74,7 @@
         else:
             query = "USER: {} ASSISTANT:".format([query])
 
-        if image is None:
-            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
-        else:
-            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
-
+        input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image] if image is not None else None)
         inputs = {
             'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
             'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
             'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),

From c33ad63eb922a809a713a5777aa58f30abb3747e Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Wed, 20 Dec 2023 02:24:28 +0000
Subject: [PATCH 4/7] update basic_demo_hf

---
 basic_demo/cli_demo_hf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 96e513ad..67bdd581 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -57,7 +57,7 @@
     if image_path == '':
         print('You did not enter image path, the following will be a plain text conversation.')
         image = None
-        first_query = True
+        text_only_first_query = True
     else:
         image = Image.open(image_path).convert('RGB')
 
@@ -68,9 +68,9 @@
         if query == "clear":
             break
         if image is None:
-            if first_query:
+            if text_only_first_query:
                 query = text_only_template.format([query])
-                first_query = False
+                text_only_first_query = False
             else:
                 query = "USER: {} ASSISTANT:".format([query])

From 54a57c42e8b38ddbdd77682e399d25a1cbe9aed3 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Wed, 20 Dec 2023 03:34:11 +0000
Subject: [PATCH 5/7] update base_cli_demo_hf

---
 basic_demo/cli_demo_hf.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 67bdd581..419d5232 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -49,8 +49,7 @@
         trust_remote_code=True
     ).to(DEVICE).eval()
 
-text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
-    The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
 
 while True:
     image_path = input("image path >>>>> ")
@@ -67,14 +66,22 @@
         query = input("Human:")
         if query == "clear":
             break
+
         if image is None:
             if text_only_first_query:
-                query = text_only_template.format([query])
+                query = text_only_template.format(query)
                 text_only_first_query = False
             else:
-                query = "USER: {} ASSISTANT:".format([query])
+                old_prompt = ''
+                for _, (old_query, response) in enumerate(history):
+                    old_prompt += old_query + " " + response + "\n"
+                query = old_prompt + "USER: {} ASSISTANT:".format(query)
+
+        if image is None:
+            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, template_version='base')
+        else:
+            input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
 
-        input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image] if image is not None else None)
         inputs = {
             'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
             'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),

From e4ee77f3b528061515fc9712d8ee6bbac34b698d Mon Sep 17 00:00:00 2001
From: wenyihong <48993524+wenyihong@users.noreply.github.com>
Date: Thu, 21 Dec 2023 16:02:03 +0800
Subject: [PATCH 6/7] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dbc1e893..edfefd1d 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
 dialogue with images, GUI Agent, Grounding**, and more.
 
 🌟 **Jump to detailed introduction: [Introduction to CogVLM](#introduction-to-cogvlm), 🆕 [Introduction to CogAgent](#introduction-to-cogagent)**
 
-📔 For more detailed usage information, please refer to: [CogVLM technical documentation(Only Chinese)](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)
+📔 For more detailed usage information, please refer to: [CogVLM & CogAgent's technical documentation (in Chinese)](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)

From a39ea6e7be7d78da75affa855ea31139b8680f62 Mon Sep 17 00:00:00 2001
From: wenyihong <48993524+wenyihong@users.noreply.github.com>
Date: Thu, 21 Dec 2023 16:02:51 +0800
Subject: [PATCH 7/7] Update README_zh.md

---
 README_zh.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_zh.md b/README_zh.md
index 93dc1c94..ac0648fe 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -1,6 +1,6 @@
 # CogVLM & CogAgent
 
-📗 [中文版README](./README_zh.md)
+📗 [README in English](./README.md)
 
 🔥🔥🔥 🆕 ```2023/12/15```: CogAgent正式上线!CogAgent是基于CogVLM开发的图像理解模型。它具有基于视觉的GUI Agent功能,并在图像理解方面有进一步的增强。它支持1120*1120分辨率的图像输入,并具有包括与图像进行多轮对话、GUI
 Agent、Grounding等多种能力。
@@ -9,7 +9,7 @@
 🌟 **跳转到详细介绍: [CogVLM介绍](#introduction-to-cogvlm), 🆕 [CogAgent的介绍](#introduction-to-cogagent)**
 
-📔 如需获取更详细的使用信息,请参阅: [CogVLM技术文档](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)
+📔 如需获取更详细的使用信息,请参阅: [CogVLM&CogAgent技术文档](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)
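Usage sketch (not part of the patch series): with this series applied, basic_demo/cli_demo_hf.py handles both image chat and plain-text chat. Based on the argparse flags defined in the patches above, it would typically be launched along these lines; the exact command line is only an illustration and depends on your environment:

    python basic_demo/cli_demo_hf.py --from_pretrained THUDM/cogvlm-chat-hf --bf16
    python basic_demo/cli_demo_hf.py --from_pretrained THUDM/cogagent-chat-hf --quant 4

At the "image path >>>>> " prompt, entering a path starts an image conversation; pressing Enter on an empty line switches to the plain-text template introduced in patches 3-5.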