<p align="center">
  <img src="https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/mm-vet-v2_logo.jpg" width="400"> <br>
</p>

# [MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/xxxx.xxxxx)

<p align="center">
  [<a href="https://arxiv.org/abs/xxxx.xxxxx">Paper</a>]
  [<a href="https://github.com/yuweihao/MM-Vet/releases/download/v2/mm-vet-v2.zip">Download Dataset</a>]
  [<a href="https://paperswithcode.com/sota/visual-question-answering-on-mm-vet-v2"><b>Leaderboard</b></a>]
  [<a href="https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator">Online Evaluator</a>]
</p>

![MM-Vet v2 examples](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/mm-vet-v2_examples.jpg)

Figure 1: Four examples from MM-Vet v2. Compared with MM-Vet, MM-Vet v2 introduces more high-quality evaluation samples (e.g., (a) and (b)), as well as samples requiring the new capability of image-text sequence understanding (e.g., (c) and (d)).

## Evaluate your model on MM-Vet v2

**Step 0**: Install the openai package with `pip install "openai>=1"` and obtain access to the GPT-4 API. If you do not have access, you can use the MM-Vet v2 online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator) instead (it may take a while, depending on the number of users).
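The openai (>=1) client reads the `OPENAI_API_KEY` environment variable by default; assuming the evaluator relies on that convention, a typical setup looks like:

```bash
export OPENAI_API_KEY=<your_key_here>
```
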
**Step 1**: Download the MM-Vet v2 data [here](https://github.com/yuweihao/MM-Vet/releases/download/v2/mm-vet-v2.zip) and unzip it with `unzip mm-vet-v2.zip`.
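Unzipping should produce a layout like the one below; the inference scripts expect the `images/` folder and the `mm-vet-v2.json` metadata file at the top level:

```
mm-vet-v2/
├── images/          # benchmark images referenced by the questions
└── mm-vet-v2.json   # question ids, prompts, and metadata
```
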
**Step 2**: Run your model on MM-Vet v2 and save its outputs as a JSON file like [gpt-4o-2024-05-13_detail-high.json](results/gpt-4o-2024-05-13_detail-high.json), or simply use that file as an example to try the evaluator. We also release inference scripts for GPT-4, Claude, and Gemini.
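The expected result format is a flat JSON object mapping each question id from `mm-vet-v2.json` to the model's answer string; the ids and answers below are illustrative placeholders only:

```json
{
    "v2_0": "The total price of the fruits is 12.50 dollars.",
    "v2_1": "..."
}
```
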
```bash
image_detail=high # or "auto" / "low"; see https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding

python inference/gpt4.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gpt-4o-2024-05-13 --image_detail ${image_detail}
```

```bash
python inference/claude.py --mmvetv2_path /path/to/mm-vet-v2 --model_name claude-3-5-sonnet-20240620
```

```bash
python inference/gemini.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gemini-1.5-pro
```

**Step 3**: Run `git clone https://github.com/yuweihao/MM-Vet.git && cd MM-Vet/v2`, then run the LLM-based evaluator:
```bash
python mm-vet-v2_evaluator.py --mmvetv2_path /path/to/mm-vet-v2 --result_file results/gpt-4o-2024-05-13_detail-high.json
```
If you cannot access GPT-4 (gpt-4-0613), you can upload your model outputs (the JSON file) to the MM-Vet v2 online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator) to get the grading results.
## Citation
```
@article{yu2024mmvetv2,
  title={MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities},
  author={Weihao Yu and Zhengyuan Yang and Lingfeng Ren and Linjie Li and Jianfeng Wang and Kevin Lin and Chung-Ching Lin and Zicheng Liu and Lijuan Wang and Xinchao Wang},
  journal={arXiv preprint arXiv:xxxx.xxxxx},
  year={2024}
}
```

## inference/claude.py

```python
import json
import os
import base64
import argparse
import anthropic


# Encode an image file as base64 for the Anthropic messages API.
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


class Claude:
    def __init__(self, api_key,
                 model="claude-3-5-sonnet-20240620", temperature=0.0,
                 max_tokens=512, system=None):
        self.model = model
        self.client = anthropic.Anthropic(
            api_key=api_key,
        )
        self.system = system
        self.temperature = temperature
        self.max_tokens = max_tokens

    def get_response(self, image_folder, prompt="What's in this image?"):
        # MM-Vet v2 questions interleave text and image file names,
        # delimited by <IMG> tokens.
        messages = []
        content = []
        queries = prompt.split("<IMG>")
        img_num = 0
        for query in queries:
            query = query.strip()
            if query == "":
                continue
            if query.endswith((".jpg", ".png", ".jpeg")):
                image_path = os.path.join(image_folder, query)
                base64_image = encode_image(image_path)
                image_format = "png" if image_path.endswith(".png") else "jpeg"
                content.append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": f"image/{image_format}",
                            "data": base64_image,
                        },
                    }
                )
                img_num += 1
            else:
                content.append(
                    {
                        "type": "text",
                        "text": query,
                    },
                )

        messages.append({
            "role": "user",
            "content": content,
        })

        payload = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }

        if self.system:
            payload["system"] = self.system

        response = self.client.messages.create(**payload)
        response_text = response.content[0].text
        return response_text


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--mmvetv2_path",
        type=str,
        default="/path/to/mm-vet-v2",
        help="Download mm-vet-v2.zip, `unzip mm-vet-v2.zip`, and set the path here",
    )
    parser.add_argument(
        "--result_path",
        type=str,
        default="results",
    )
    parser.add_argument(
        "--anthropic_api_key", type=str, default=None,
        help="Anthropic API key; defaults to the ANTHROPIC_API_KEY environment variable",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="claude-3-5-sonnet-20240620",
        help="Claude model name",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = arg_parser()
    model_name = args.model_name
    if not os.path.exists(args.result_path):
        os.makedirs(args.result_path)
    results_path = os.path.join(args.result_path, f"{model_name}.json")
    image_folder = os.path.join(args.mmvetv2_path, "images")
    meta_data = os.path.join(args.mmvetv2_path, "mm-vet-v2.json")

    if args.anthropic_api_key:
        ANTHROPIC_API_KEY = args.anthropic_api_key
    else:
        ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

    if ANTHROPIC_API_KEY is None:
        raise ValueError("Please set the ANTHROPIC_API_KEY environment variable or pass it as an argument")

    claude = Claude(ANTHROPIC_API_KEY, model=model_name)

    # Resume from an existing results file so interrupted runs can continue.
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            results = json.load(f)
    else:
        results = {}

    with open(meta_data, "r") as f:
        data = json.load(f)

    for id in data:
        if id in results:
            continue
        prompt = data[id]["question"].strip()
        print(id)
        print(f"Prompt: {prompt}")
        response = claude.get_response(image_folder, prompt)
        print(f"Response: {response}")
        results[id] = response
        # Checkpoint after every sample so partial progress is not lost.
        with open(results_path, "w") as f:
            json.dump(results, f, indent=4)
```
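
As a reference for the `<IMG>` prompt format that `get_response` parses, here is a minimal sketch of calling the class directly; the image file name and question are hypothetical placeholders:

```python
import os

# Hypothetical usage; assumes ANTHROPIC_API_KEY is set and the image exists.
claude = Claude(os.getenv("ANTHROPIC_API_KEY"))
answer = claude.get_response(
    "/path/to/mm-vet-v2/images",
    prompt="<IMG>v2_0.jpg<IMG>What is the total price of the fruits shown?",
)
print(answer)
```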

## inference/gemini.py

```python
import os
import time
from pathlib import Path
import argparse
import google.generativeai as genai
from utils import evaluate_on_mmvetv2


class Gemini:
    def __init__(self, model="gemini-1.5-pro"):
        self.model = genai.GenerativeModel(model)

    def get_response(self, image_folder, prompt="What's in this image?") -> str:
        # MM-Vet v2 questions interleave text and image file names,
        # delimited by <IMG> tokens.
        content = []
        queries = prompt.split("<IMG>")
        img_num = 0
        for query in queries:
            # Mirror inference/claude.py: drop whitespace and empty segments.
            query = query.strip()
            if query == "":
                continue
            if query.endswith((".jpg", ".png", ".jpeg")):
                image_path = Path(os.path.join(image_folder, query))
                image = {
                    'mime_type': f'image/{image_path.suffix[1:].replace("jpg", "jpeg")}',
                    'data': image_path.read_bytes()
                }
                img_num += 1
                content.append(image)
            else:
                content.append(query)

        # Skip samples with too many images for a single request.
        if img_num > 16:
            return ""
        # Query the model, retrying on transient errors (e.g., rate limits).
        text = ""
        while len(text) < 1:
            try:
                response = self.model.generate_content(
                    content
                )
                try:
                    text = response.text
                except ValueError:
                    # response.text raises when there is no valid candidate
                    # (e.g., the response was blocked); return an empty answer.
                    text = " "
            except Exception as error:
                print(error)
                print('Sleeping for 10 seconds')
                time.sleep(10)
        return text.strip()


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--mmvetv2_path",
        type=str,
        default="/path/to/mm-vet-v2",
        help="Download mm-vet-v2.zip, `unzip mm-vet-v2.zip`, and set the path here",
    )
    parser.add_argument(
        "--result_path",
        type=str,
        default="results",
    )
    parser.add_argument(
        "--google_api_key", type=str, default=None,
        help="refer to https://ai.google.dev/tutorials/python_quickstart"
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="gemini-1.5-pro",
        help="Gemini model name",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = arg_parser()

    if args.google_api_key:
        GOOGLE_API_KEY = args.google_api_key
    else:
        GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

    if GOOGLE_API_KEY is None:
        raise ValueError("Please set the GOOGLE_API_KEY environment variable or pass it as an argument")

    genai.configure(api_key=GOOGLE_API_KEY)
    model = Gemini(model=args.model_name)

    evaluate_on_mmvetv2(args, model)
```
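
`evaluate_on_mmvetv2` is imported from `utils.py`, which is not shown in this commit; below is a plausible minimal sketch, assuming it mirrors the resume-and-checkpoint loop written out inline in `inference/claude.py`:

```python
import json
import os


def evaluate_on_mmvetv2(args, model):
    # Hypothetical reconstruction; the real utils.evaluate_on_mmvetv2 may differ.
    results_path = os.path.join(args.result_path, f"{args.model_name}.json")
    image_folder = os.path.join(args.mmvetv2_path, "images")
    with open(os.path.join(args.mmvetv2_path, "mm-vet-v2.json"), "r") as f:
        data = json.load(f)

    # Resume from an existing results file so interrupted runs can continue.
    results = {}
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            results = json.load(f)

    os.makedirs(args.result_path, exist_ok=True)
    for qid, item in data.items():
        if qid in results:
            continue
        results[qid] = model.get_response(image_folder, item["question"].strip())
        # Checkpoint after every sample so partial progress is not lost.
        with open(results_path, "w") as f:
            json.dump(results, f, indent=4)
```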