From e3598765f44d345d2eb7a503ce69aefb9db49e59 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Tue, 19 Dec 2023 10:26:44 +0000
Subject: [PATCH 1/7] Update cogvlm demo to support plain-text conversation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
basic_demo/cli_demo_hf_cogvlm_chat.py | 102 ++++++++++++++++++++++++++
1 file changed, 102 insertions(+)
create mode 100644 basic_demo/cli_demo_hf_cogvlm_chat.py
diff --git a/basic_demo/cli_demo_hf_cogvlm_chat.py b/basic_demo/cli_demo_hf_cogvlm_chat.py
new file mode 100644
index 00000000..349d07fd
--- /dev/null
+++ b/basic_demo/cli_demo_hf_cogvlm_chat.py
@@ -0,0 +1,102 @@
+"""
+This is a demo for using CogAgent and CogVLM in CLI
+Make sure you have installed vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5), full checkpoint of vicuna-7b-v1.5 LLM is not required.
+In this demo, We us chat template, you can use others to replace such as 'vqa'.
+Strongly suggest to use GPU with bfloat16 support, otherwise, it will be slow.
+Mention that only one picture can be processed at one conversation, which means you can not replace or insert another picture during the conversation.
+"""
+
+import argparse
+import torch
+
+from PIL import Image
+from transformers import AutoModelForCausalLM, LlamaTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--quant", choices=[4], type=int, default=4, help='quantization bits')
+parser.add_argument("--from_pretrained", type=str, default="THUDM/cogvlm-chat-hf", help='pretrained ckpt')
+parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
+parser.add_argument("--fp16", action="store_true")
+parser.add_argument("--bf16", action="store_true")
+
+args = parser.parse_args()
+MODEL_PATH = args.from_pretrained
+TOKENIZER_PATH = args.local_tokenizer
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH)
+if args.bf16:
+ torch_type = torch.bfloat16
+else:
+ torch_type = torch.float16
+
+print("========Use torch type as:{} with device:{}========\n".format(torch_type, DEVICE))
+
+if args.quant:
+ model = AutoModelForCausalLM.from_pretrained(
+ MODEL_PATH,
+ torch_dtype=torch_type,
+ low_cpu_mem_usage=True,
+ load_in_4bit=True,
+ trust_remote_code=True
+ ).eval()
+else:
+ model = AutoModelForCausalLM.from_pretrained(
+ MODEL_PATH,
+ torch_dtype=torch_type,
+ low_cpu_mem_usage=True,
+ load_in_4bit=args.quant is not None,
+ trust_remote_code=True
+ ).to(DEVICE).eval()
+
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
+ The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+
+while True:
+ image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
+ if image_or_text == 'image':
+ image_path = input("image path >>>>> ")
+ if image_path == "stop":
+ break
+ image = Image.open(image_path).convert('RGB')
+ else:
+ image = None
+ first_query = True
+ history = []
+
+ while True:
+ query = input("Human:")
+ if query == "clear":
+ break
+
+ if image is None:
+ if first_query:
+ query = text_only_template.format([query])
+ first_query = False
+ else:
+ query = "USER: {} ASSISTANT:".format([query])
+
+ if image is None:
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
+ else:
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
+
+ inputs = {
+ 'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
+ 'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
+ 'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
+ 'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None,
+ }
+ if 'cross_images' in input_by_model and input_by_model['cross_images']:
+ inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
+
+ # add any transformers params here.
+ gen_kwargs = {"max_length": 2048,
+ "do_sample": False}
+ with torch.no_grad():
+ outputs = model.generate(**inputs, **gen_kwargs)
+ outputs = outputs[:, inputs['input_ids'].shape[1]:]
+ response = tokenizer.decode(outputs[0])
+        response = response.split("</s>")[0]
+ print("\nCog:", response)
+ history.append((query, response))
\ No newline at end of file
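The `--quant 4` path above loads the weights with bitsandbytes via `load_in_4bit=True`. A roughly equivalent, more explicit way to request the same quantization (a sketch only, assuming a recent transformers release with bitsandbytes installed; not part of the patch) is to pass a `BitsAndBytesConfig`:

    # Sketch only (not part of the patch): explicit 4-bit quantization config,
    # equivalent in spirit to load_in_4bit=True in the demo above.
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    model = AutoModelForCausalLM.from_pretrained(
        "THUDM/cogvlm-chat-hf",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        quantization_config=bnb_config,  # replaces load_in_4bit=True
        trust_remote_code=True,
    ).eval()
    # Quantized weights are placed on the GPU by from_pretrained, so no explicit .to(DEVICE) is needed.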
From 4130e10410c3c48afc395081cf52123e7b993b04 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Tue, 19 Dec 2023 12:12:17 +0000
Subject: [PATCH 2/7] update cli_demo_hf
---
basic_demo/cli_demo_hf.py | 39 +++++++---
basic_demo/cli_demo_hf_cogvlm_chat.py | 102 --------------------------
2 files changed, 30 insertions(+), 111 deletions(-)
delete mode 100644 basic_demo/cli_demo_hf_cogvlm_chat.py
diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 76bd9444..65959cbd 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -14,7 +14,7 @@
parser = argparse.ArgumentParser()
parser.add_argument("--quant", choices=[4], type=int, default=None, help='quantization bits')
-parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
+parser.add_argument("--from_pretrained", choices=["THUDM/cogagent-chat-hf", "THUDM/cogvlm-chat-hf"], type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--bf16", action="store_true")
@@ -49,31 +49,52 @@
trust_remote_code=True
).to(DEVICE).eval()
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
+ The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+
while True:
- image_path = input("image path >>>>> ")
- if image_path == "stop":
- break
+ image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
- image = Image.open(image_path).convert('RGB')
+ if image_or_text == 'image':
+ image_path = input("image path >>>>> ")
+ image = Image.open(image_path).convert('RGB')
+ elif image_or_text == 'text':
+ image = None
+ first_query = True
+ else:
+ print("Please choose one from 'image' and 'text'!")
+ break
+
history = []
+
while True:
query = input("Human:")
if query == "clear":
break
- input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
+ if image is None:
+ if first_query:
+ query = text_only_template.format([query])
+ first_query = False
+ else:
+ query = "USER: {} ASSISTANT:".format([query])
+
+ if image is None:
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
+ else:
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
+
inputs = {
'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
- 'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]],
+ 'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None,
}
if 'cross_images' in input_by_model and input_by_model['cross_images']:
inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
# add any transformers params here.
gen_kwargs = {"max_length": 2048,
- "temperature": 0.9,
- "do_sample": False}
+ "do_sample": False} # "temperature": 0.9
with torch.no_grad():
outputs = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
diff --git a/basic_demo/cli_demo_hf_cogvlm_chat.py b/basic_demo/cli_demo_hf_cogvlm_chat.py
deleted file mode 100644
index 349d07fd..00000000
--- a/basic_demo/cli_demo_hf_cogvlm_chat.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-This is a demo for using CogAgent and CogVLM in CLI
-Make sure you have installed vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5), full checkpoint of vicuna-7b-v1.5 LLM is not required.
-In this demo, We us chat template, you can use others to replace such as 'vqa'.
-Strongly suggest to use GPU with bfloat16 support, otherwise, it will be slow.
-Mention that only one picture can be processed at one conversation, which means you can not replace or insert another picture during the conversation.
-"""
-
-import argparse
-import torch
-
-from PIL import Image
-from transformers import AutoModelForCausalLM, LlamaTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--quant", choices=[4], type=int, default=4, help='quantization bits')
-parser.add_argument("--from_pretrained", type=str, default="THUDM/cogvlm-chat-hf", help='pretrained ckpt')
-parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
-parser.add_argument("--fp16", action="store_true")
-parser.add_argument("--bf16", action="store_true")
-
-args = parser.parse_args()
-MODEL_PATH = args.from_pretrained
-TOKENIZER_PATH = args.local_tokenizer
-DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH)
-if args.bf16:
- torch_type = torch.bfloat16
-else:
- torch_type = torch.float16
-
-print("========Use torch type as:{} with device:{}========\n".format(torch_type, DEVICE))
-
-if args.quant:
- model = AutoModelForCausalLM.from_pretrained(
- MODEL_PATH,
- torch_dtype=torch_type,
- low_cpu_mem_usage=True,
- load_in_4bit=True,
- trust_remote_code=True
- ).eval()
-else:
- model = AutoModelForCausalLM.from_pretrained(
- MODEL_PATH,
- torch_dtype=torch_type,
- low_cpu_mem_usage=True,
- load_in_4bit=args.quant is not None,
- trust_remote_code=True
- ).to(DEVICE).eval()
-
-text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
- The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
-
-while True:
- image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
- if image_or_text == 'image':
- image_path = input("image path >>>>> ")
- if image_path == "stop":
- break
- image = Image.open(image_path).convert('RGB')
- else:
- image = None
- first_query = True
- history = []
-
- while True:
- query = input("Human:")
- if query == "clear":
- break
-
- if image is None:
- if first_query:
- query = text_only_template.format([query])
- first_query = False
- else:
- query = "USER: {} ASSISTANT:".format([query])
-
- if image is None:
- input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
- else:
- input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
-
- inputs = {
- 'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
- 'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
- 'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
- 'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]] if image is not None else None,
- }
- if 'cross_images' in input_by_model and input_by_model['cross_images']:
- inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
-
- # add any transformers params here.
- gen_kwargs = {"max_length": 2048,
- "do_sample": False}
- with torch.no_grad():
- outputs = model.generate(**inputs, **gen_kwargs)
- outputs = outputs[:, inputs['input_ids'].shape[1]:]
- response = tokenizer.decode(outputs[0])
-        response = response.split("</s>")[0]
- print("\nCog:", response)
- history.append((query, response))
\ No newline at end of file
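A note on the `gen_kwargs` change in this patch: `temperature` has no effect when `do_sample=False` (greedy decoding), which is why it can safely be moved into a comment. If sampled outputs are wanted instead, the settings would look roughly like the following sketch (the sampling values are illustrative assumptions, not part of the patch):

    # Greedy decoding, as used in the patch: temperature is ignored in this mode.
    gen_kwargs = {"max_length": 2048, "do_sample": False}

    # Sampling variant (illustrative assumption): temperature/top_p only take effect with do_sample=True.
    gen_kwargs_sampling = {"max_length": 2048, "do_sample": True, "temperature": 0.9, "top_p": 0.9}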
From 2493b815370b8d76f3723d96bbf95b1ce17bc0a9 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Tue, 19 Dec 2023 12:34:10 +0000
Subject: [PATCH 3/7] update basic hf demo
---
basic_demo/cli_demo_hf.py | 22 +++++++---------------
1 file changed, 7 insertions(+), 15 deletions(-)
diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 65959cbd..96e513ad 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -14,7 +14,7 @@
parser = argparse.ArgumentParser()
parser.add_argument("--quant", choices=[4], type=int, default=None, help='quantization bits')
-parser.add_argument("--from_pretrained", choices=["THUDM/cogagent-chat-hf", "THUDM/cogvlm-chat-hf"], type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
+parser.add_argument("--from_pretrained", type=str, default="THUDM/cogagent-chat-hf", help='pretrained ckpt')
parser.add_argument("--local_tokenizer", type=str, default="lmsys/vicuna-7b-v1.5", help='tokenizer path')
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--bf16", action="store_true")
@@ -53,17 +53,13 @@
The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
while True:
- image_or_text = input("Chat with an image or text only? Please choose one from 'image' and 'text': ")
-
- if image_or_text == 'image':
- image_path = input("image path >>>>> ")
- image = Image.open(image_path).convert('RGB')
- elif image_or_text == 'text':
+ image_path = input("image path >>>>> ")
+ if image_path == '':
+        print('You did not enter an image path; the following will be a plain-text conversation.')
image = None
- first_query = True
+ first_query = True
else:
- print("Please choose one from 'image' and 'text'!")
- break
+ image = Image.open(image_path).convert('RGB')
history = []
@@ -78,11 +74,7 @@
else:
query = "USER: {} ASSISTANT:".format([query])
- if image is None:
- input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history)
- else:
- input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
-
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image] if image is not None else None)
inputs = {
'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
From c33ad63eb922a809a713a5777aa58f30abb3747e Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Wed, 20 Dec 2023 02:24:28 +0000
Subject: [PATCH 4/7] update basic_demo_hf
---
basic_demo/cli_demo_hf.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 96e513ad..67bdd581 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -57,7 +57,7 @@
if image_path == '':
         print('You did not enter an image path; the following will be a plain-text conversation.')
image = None
- first_query = True
+ text_only_first_query = True
else:
image = Image.open(image_path).convert('RGB')
@@ -68,9 +68,9 @@
if query == "clear":
break
if image is None:
- if first_query:
+ if text_only_first_query:
query = text_only_template.format([query])
- first_query = False
+ text_only_first_query = False
else:
query = "USER: {} ASSISTANT:".format([query])
From 54a57c42e8b38ddbdd77682e399d25a1cbe9aed3 Mon Sep 17 00:00:00 2001
From: duchenzhuang <15652580397@163.com>
Date: Wed, 20 Dec 2023 03:34:11 +0000
Subject: [PATCH 5/7] update base_cli_demo_hf
---
basic_demo/cli_demo_hf.py | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/basic_demo/cli_demo_hf.py b/basic_demo/cli_demo_hf.py
index 67bdd581..419d5232 100644
--- a/basic_demo/cli_demo_hf.py
+++ b/basic_demo/cli_demo_hf.py
@@ -49,8 +49,7 @@
trust_remote_code=True
).to(DEVICE).eval()
-text_only_template = "A chat between a curious user and an artificial intelligence assistant. \
- The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
+text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"
while True:
image_path = input("image path >>>>> ")
@@ -67,14 +66,22 @@
query = input("Human:")
if query == "clear":
break
+
if image is None:
if text_only_first_query:
- query = text_only_template.format([query])
+ query = text_only_template.format(query)
text_only_first_query = False
else:
- query = "USER: {} ASSISTANT:".format([query])
+ old_prompt = ''
+ for _, (old_query, response) in enumerate(history):
+ old_prompt += old_query + " " + response + "\n"
+ query = old_prompt + "USER: {} ASSISTANT:".format(query)
+
+ if image is None:
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, template_version='base')
+ else:
+ input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image])
- input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, images=[image] if image is not None else None)
inputs = {
'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
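Since `template_version='base'` passes the query through without the chat template, the text-only branch above now stitches the conversation history into the prompt by hand. The following sketch (queries and answers are made-up placeholders) shows what the assembled prompt looks like on a follow-up turn:

    # Illustration only: how the text-only prompt is assembled on a second turn.
    text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"

    # (query, response) pair recorded after the first turn; the contents are placeholders.
    history = [(text_only_template.format("What is CogVLM?"),
                "CogVLM is an open-source visual language model.")]

    old_prompt = ""
    for old_query, response in history:
        old_prompt += old_query + " " + response + "\n"
    query = old_prompt + "USER: {} ASSISTANT:".format("Who develops it?")
    # query now ends with:
    # "... USER: What is CogVLM? ASSISTANT: CogVLM is an open-source visual language model.\nUSER: Who develops it? ASSISTANT:"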
From e4ee77f3b528061515fc9712d8ee6bbac34b698d Mon Sep 17 00:00:00 2001
From: wenyihong <48993524+wenyihong@users.noreply.github.com>
Date: Thu, 21 Dec 2023 16:02:03 +0800
Subject: [PATCH 6/7] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index dbc1e893..edfefd1d 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ dialogue with images, GUI Agent, Grounding**, and more.
🌟 **Jump to detailed introduction: [Introduction to CogVLM](#introduction-to-cogvlm),
🆕 [Introduction to CogAgent](#introduction-to-cogagent)**
-📔 For more detailed usage information, please refer to: [CogVLM technical documentation(Only Chinese)](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)
+📔 For more detailed usage information, please refer to: [CogVLM & CogAgent's technical documentation (in Chinese)](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)
From a39ea6e7be7d78da75affa855ea31139b8680f62 Mon Sep 17 00:00:00 2001
From: wenyihong <48993524+wenyihong@users.noreply.github.com>
Date: Thu, 21 Dec 2023 16:02:51 +0800
Subject: [PATCH 7/7] Update README_zh.md
---
README_zh.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/README_zh.md b/README_zh.md
index 93dc1c94..ac0648fe 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -1,6 +1,6 @@
# CogVLM & CogAgent
-📗 [README in Chinese](./README_zh.md)
+📗 [README in English](./README.md)
🔥🔥🔥 🆕 ```2023/12/15```: CogAgent is officially released! CogAgent is an image-understanding model developed on top of CogVLM. It features vision-based GUI
Agent capabilities and further enhancements in image understanding. It supports 1120*1120-resolution image input and offers capabilities including multi-turn dialogue with images, GUI
@@ -9,7 +9,7 @@ Agent, Grounding, and more.
🌟 **Jump to detailed introduction: [Introduction to CogVLM](#introduction-to-cogvlm),
🆕 [Introduction to CogAgent](#introduction-to-cogagent)**
-📔 For more detailed usage information, please refer to: [CogVLM technical documentation](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)
+📔 For more detailed usage information, please refer to: [CogVLM & CogAgent technical documentation](https://zhipu-ai.feishu.cn/wiki/LXQIwqo1OiIVTykMh9Lc3w1Fn7g)