[fix] log_batch_results saves incomplete results #266

Merged
8 commits merged on Jun 23, 2024

15 changes: 12 additions & 3 deletions .github/.test_durations
@@ -1,7 +1,7 @@
{
"tests/dry_test/test_datasets.py::test_crows_pairs_dry_run": 2.918293869000081,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[agieval-extra_args0]": 32.72024002399999,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[alpaca_eval-skip]": 0.0016126749999969547,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[alpaca_eval-extra_args1]": 6.09870545566082,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[anli-extra_args2]": 51.73772629200001,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[arc-extra_args3]": 32.036750494,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[bbh-extra_args4]": 22.74885801099998,
@@ -27,7 +27,7 @@
"tests/dry_test/test_datasets.py::test_datasets_dry_run[mbpp-extra_args24]": 32.793481805,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[mmlu-extra_args25]": 6.294899032000046,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[mrpc-extra_args26]": 16.370866133999982,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[mt_bench-skip]": 0.0008058610000034605,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[mt_bench-extra_args28]": 15.967769110575318,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[nq-extra_args28]": 25.117774340999972,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[openbookqa-extra_args29]": 27.788599147999946,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[penguins_in_a_table-extra_args30]": 0.11626804900004117,
@@ -46,7 +46,7 @@
"tests/dry_test/test_datasets.py::test_datasets_dry_run[triviaqa-extra_args43]": 46.31700100900002,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[truthfulqa_mc-extra_args44]": 20.452524830000016,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[tydiqa-extra_args45]": 9.735652780999999,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[vicuna_bench-skip]": 0.0009366230000296127,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[vicuna_bench-extra_args47]": 5.973800586536527,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[webq-extra_args47]": 23.36286485000005,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[wic-extra_args48]": 1.1293475459999627,
"tests/dry_test/test_datasets.py::test_datasets_dry_run[winogender-extra_args49]": 10.784725986999888,
@@ -72,6 +72,12 @@
"tests/dry_test/test_models.py::test_models_dry_run[qwen-turbo-extra_args2-gsm8k]": 22.391773478000232,
"tests/dry_test/test_models.py::test_models_dry_run[qwen-turbo-extra_args2-hellaswag]": 29.800566227000445,
"tests/dry_test/test_models.py::test_models_dry_run[qwen-turbo-extra_args2-mmlu]": 27.11841242399987,
"tests/utilization/dataset/test_dataset_length.py::test_dataset_length[gpqa-4]": 4.746588402427733,
"tests/utilization/dataset/test_dataset_length.py::test_dataset_length[gsm8k-1-generation]": 15.47886633593589,
"tests/utilization/dataset/test_dataset_length.py::test_dataset_length[mmlu:abstract_algebra-1-prob]": 0.08624407928436995,
"tests/utilization/dataset/test_dataset_length.py::test_dataset_length[mmlu:abstract_algebra-4-ppl]": 0.14216343127191067,
"tests/utilization/dataset/test_dataset_length.py::test_dataset_length[mmlu:abstract_algebra-4-ppl_no_option]": 3.515047383494675,
"tests/utilization/dataset/test_dataset_length.py::test_dataset_length[openbookqa-4-ppl_no_option]": 18.495867879129946,
"tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-0-None-None]": 16.317220823839307,
"tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-2-least_to_most-None]": 16.421796096488833,
"tests/utilization/dataset/test_formatting.py::test_formatting[gsm8k-8-None-None]": 15.621955395676196,
@@ -90,6 +96,9 @@
"tests/utilization/model/test_apply_prompt_template.py::test_llama2": 0.0018699830397963524,
"tests/utilization/model/test_apply_prompt_template.py::test_no_smart_space": 0.0017823278903961182,
"tests/utilization/model/test_apply_prompt_template.py::test_smart_space": 0.0016713934019207954,
"tests/utilization/model/test_apply_prompt_template.py::test_tokenizer_chat_template": 0.5478693814948201,
"tests/utilization/model/test_ensure_type.py::test_ensure_type_list": 0.0005008839070796967,
"tests/utilization/model/test_ensure_type.py::test_ensure_type_str": 0.3512640204280615,
"tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[generation-False]": 0.0020645475015044212,
"tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[generation-True]": 0.0018382547423243523,
"tests/utilization/model/test_to_model_prompt.py::test_to_model_prompt[get_ppl-False]": 0.0018322393298149109,
2 changes: 1 addition & 1 deletion .github/workflows/pytest-check.yml
@@ -81,6 +81,6 @@ jobs:
- name: Run coverage
run: |
coverage combine coverage*/.coverage*
coverage report --fail-under=90 -i
coverage report --fail-under=70 -i
coverage xml -i
- uses: codecov/codecov-action@v1
16 changes: 7 additions & 9 deletions docs/README.md
@@ -2,30 +2,28 @@

## Training

Tutorial: [Training](https://github.com/RUCAIBox/LLMBox/tree/main/training)
- Tutorial: [Training](https://github.com/RUCAIBox/LLMBox/tree/main/training)

## Utilization

CLI Usage: [Utilization](https://github.com/RUCAIBox/LLMBox/tree/main/utilization)
Reproduction: [test.sh](https://github.com/RUCAIBox/LLMBox/blob/main/test.sh)
- CLI Usage: [Utilization](https://github.com/RUCAIBox/LLMBox/tree/main/utilization)
- Reproduction: [test.sh](https://github.com/RUCAIBox/LLMBox/blob/main/test.sh)
<!-- - [Example: Benchmarking LLaMA-3](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/benchmarking_llama3.md) -->
- [Trouble Shooting: Debug an evaluation run](https://github.com/RUCAIBox/LLMBox/blob/main/docs/trouble_shooting/debug_evaluation_run.md)

### Datasets

- [Supported datasets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/supported-datasets.md)
- [How to load datasets with subsets](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-load-datasets-with-subsets.md)
- [How to load datasets from HuggingFace](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-load-datasets-from-huggingface.md)
- [How to customize dataset](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-customize-dataset.md)
- [Example: Customize dataset](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/customize_dataset.py)

### Models

- [How to customize model](https://github.com/RUCAIBox/LLMBox/blob/main/docs/utilization/how-to-customize-model.md)

## Examples

- [Customize dataset](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/customize_dataset.py)
- [Customize HuggingFace model](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/customize_huggingface_model.py)
- [Example: Customize HuggingFace model](https://github.com/RUCAIBox/LLMBox/blob/main/docs/examples/customize_huggingface_model.py)

## Trouble Shooting

- [Debug an evaluation run](https://github.com/RUCAIBox/LLMBox/blob/main/docs/trouble_shooting/debug_evaluation_run.md)
- [vLLM no module name packaging](https://github.com/RUCAIBox/LLMBox/blob/main/docs/trouble_shooting/vllm_no_module_name_packaging.md)
2 changes: 1 addition & 1 deletion docs/examples/customize_dataset.py
@@ -1,7 +1,7 @@
import os
import sys

sys.path.append(".")
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from utilization import DatasetArguments, ModelArguments, get_evaluator, register_dataset
3 changes: 2 additions & 1 deletion docs/examples/customize_huggingface_model.py
@@ -1,9 +1,10 @@
import os
import sys

import torch
from transformers import LlamaForCausalLM

sys.path.append(".")
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
from utilization import DatasetArguments, ModelArguments, get_evaluator


84 changes: 84 additions & 0 deletions docs/examples/run_gpt_eval.py
@@ -0,0 +1,84 @@
import json
import os
import re
import sys

from dotenv import load_dotenv

load_dotenv()

sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))

from utilization import DatasetArguments, EvaluationArguments, ModelArguments, get_evaluator

GPTEVAL_DATASETS = {"alpaca_eval", "mt_bench", "vicuna_bench"}
GREEN = "\033[92m"
CLEAR = "\033[0m"


def main(file_path: str, continue_from: str = None):
    r"""Run GPTEval metrics.

    Use case: Sometimes, you might want to split the evaluation into two parts:
    1. LLM generation and 2. metrics calculation. This allows for more efficient
    use of GPU resources for metrics that take longer to compute, such as GPTEval.
    You can add the `--inference_only` flag in the command line for LLM generation,
    which will produce a JSON file of the evaluation results. This function performs
    the second step of reading the JSON file and calculating the GPTEval scores.

    Example:
    >>> ls evaluation_results | grep "alpaca_eval.*\.json" | xargs -I {} python docs/examples/run_gpt_eval.py evaluation_results/{}
    or continue from a GPTEval checkpoint:
    >>> python docs/examples/run_gpt_eval.py evaluation_results/<model>-alpaca_eval-<date>.json evaluation_results/gpt-3.5-turbo-alpaca_eval-<date>.json
    """

    assert file_path.endswith(".json"), "Please provide a JSON file."
    file_name = file_path.split("/")[-1]
    assert re.match(
        r".*-[^-]*-\dshot-\d\d\d\d_\d\d_\d\d-\d\d_\d\d_\d\d.json", file_name
    ), f"Please provide a valid JSON file {file_name}."
    model, dataset = re.match(r"(.*)-([^-]*)-\dshot-\d\d\d\d_\d\d_\d\d-\d\d_\d\d_\d\d.json", file_name).groups()
    assert dataset in GPTEVAL_DATASETS, f"Please provide a valid dataset. Available datasets: {GPTEVAL_DATASETS}"

    with open(file_path, "r") as f:
        args = json.loads(f.readline())
    assert args["evaluation_results"] == "batch", "Please provide the JSON file with batch evaluation results."

    evaluator = get_evaluator(
        model_args=ModelArguments(
            model_name_or_path=args["model_args"]["model_name_or_path"],
            model_backend="openai",  # use openai model to load faster
            api_endpoint="chat/completions",
            model_type=args["model_args"]["model_type"],
        ),
        dataset_args=DatasetArguments(dataset_names=[dataset], batch_size=1),
        evaluation_args=EvaluationArguments(
            continue_from=file_path,
            log_level="warning",
            gpteval_continue_from=continue_from,
        ),
    )
    metric_results = evaluator.evaluate()

    msg = ""
    for display_name, result in metric_results.items():
        if result is None:
            continue
        msg += f"\n##### {display_name} #####"
        for key, value in sorted(result.items(), key=lambda x: x[0]):
            msg += "\n{}: {:.2f}".format(key, value)

    print(evaluator.model_args)
    print(evaluator.dataset_args)
    print(evaluator.evaluation_args)
    print(f"{GREEN}{msg}{CLEAR}")


if __name__ == "__main__":
    assert len(sys.argv) >= 2, "Please provide the path to the JSON file."

    file = sys.argv[1].rstrip("/")
    print(f"LLM file: {file}")

    if len(sys.argv) > 2:
        continue_from = sys.argv[2].rstrip("/")
        print(f"GPTEval file: {continue_from}")
    else:
        continue_from = None

    main(file, continue_from)
57 changes: 57 additions & 0 deletions docs/utilization/how-to-load-dataset-gpqa.md
@@ -0,0 +1,57 @@
# How to Load Dataset GPQA

## Step 1: Install LLMBox

Install the LLMBox library by following the instructions in the [installation guide](https://github.com/RUCAIBox/LLMBox).

## Step 2: Generate a Hugging Face Token

GPQA is a gated dataset, so you first need to apply for access with your Hugging Face account. Then, generate a Hugging Face [token](https://huggingface.co/settings/tokens) so that LLMBox can download the dataset on your behalf.

> [!TIP]
> It's recommended to use a `READ`-only token.

Then you can access the dataset with the following command:

```bash
cd LLMBox
python inference.py -m model -d gpqa --hf_username <username> --hf_token <token>
```

Alternatively, you can store the token in a `.env` file at the root of the repository:

```text
HF_TOKEN=<token>
```
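
If you want to confirm that the token is picked up before launching a full run, the same `python-dotenv` mechanism used by `docs/examples/run_gpt_eval.py` can be exercised directly. The snippet below is only a quick sanity check, assuming `python-dotenv` is installed and the `.env` file sits in the working directory:

```python
# Quick sanity check: is the Hugging Face token from .env visible to Python?
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads .env from the current working directory (the repo root)

token = os.getenv("HF_TOKEN")
if token is None:
    raise RuntimeError("HF_TOKEN not found; add it to .env or pass --hf_token")
print("HF_TOKEN loaded, prefix:", token[:4] + "...")
```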

## Step 3: Clone the GPQA Repository

Next, clone the GPQA repository to obtain the chain-of-thought prompts used in the few-shot and CoT settings below.

```bash
git clone https://github.com/idavidrein/gpqa
```

## Step 4: Evaluate on GPQA (0-shot)

```bash
python inference.py -m model -d gpqa
```

## Step 5: Evaluate on GPQA (5-shot)

```bash
python inference.py -m model -d gpqa --example_set ./gpqa --num_shots 5
```

## Step 6: Evaluate on GPQA (0-shot, CoT)

```bash
python inference.py -m model -d gpqa --example_set ./gpqa --cot base
```

## Step 7: Evaluate on GPQA (5-shot, CoT)

```bash
python inference.py -m model -d gpqa --example_set ./gpqa --num_shots 5 --cot base
```
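
## Optional: Run the Evaluation from Python

The CLI calls above can also be reproduced programmatically with the `get_evaluator` API used throughout `docs/examples/`. The sketch below assumes the `DatasetArguments` fields mirror the CLI flags (`--example_set`, `--num_shots`, `--cot`); check the argument definitions for the exact names.

```python
# Sketch: programmatic 5-shot CoT evaluation on GPQA (run from the LLMBox repo root).
# The example_set/num_shots/cot field names are assumed to mirror the CLI flags.
from utilization import DatasetArguments, ModelArguments, get_evaluator

evaluator = get_evaluator(
    model_args=ModelArguments(model_name_or_path="model"),  # replace with your model
    dataset_args=DatasetArguments(
        dataset_names=["gpqa"],
        example_set="./gpqa",  # the cloned GPQA repo with chain-of-thought prompts
        num_shots=5,
        cot="base",
    ),
)
print(evaluator.evaluate())
```
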
63 changes: 63 additions & 0 deletions docs/utilization/how-to-use-chat-template.md
@@ -0,0 +1,63 @@
# How to Use Chat Template

## What is Chat Template

If you are using an instruction-tuned large language model (LLM), you need a chat template to prompt the model correctly, because different models are trained with different input formats. This convention is formalized in `transformers`' `chat_template` feature.

Newer models like LLaMA-3 ship the chat template in their `tokenizer_config.json` file, but other popular models like Vicuna do not have one built in. LLMBox provides a simple way to use chat templates with any model.

For more details, please refer to [this repo](https://github.com/chujiezheng/chat_templates) and [Hugging Face's documentation](https://huggingface.co/docs/transformers/chat_templating).
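
For reference, this is how the plain `transformers` feature is used when a model ships its own template in `tokenizer_config.json`. This is a minimal sketch; the model name is only an example and requires access to the gated LLaMA-3 weights:

```python
# Sketch: applying a tokenizer's built-in chat template with plain transformers.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# Render the conversation into a single prompt string without tokenizing it.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```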

## How to Use Chat Template in LLMBox

### Load automatically

In most cases, [LLMBox](https://github.com/RUCAIBox/LLMBox) detects the model type on its own (whether it is a base pre-trained model or a chat model). If it is a chat model, the chat template feature is applied automatically.

> [!TIP]
> You can also manually set the model type with `--model_type base` or `--model_type chat`.

Currently we support 7 chat templates: `base` (default), `llama3`, `chatml`, `llama2`, `zephyr`, `phi3`, and `alpaca` (defined [here](https://github.com/RUCAIBox/LLMBox/blob/main/utilization/chat_templates.py)). For example, if you are using a LLaMA-3 chat model, we will automatically load the `llama3` chat template, which looks like this:

```python
"llama3": {
"system_start": "<|start_header_id|>system<|end_header_id|>\n\n",
"system_end": "<|eot_id|>",
"user_start": "<|start_header_id|>user<|end_header_id|>\n\n",
"user_end": "<|eot_id|>",
"assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n",
"assistant_end": "<|eot_id|>",
"auto_leading_space": True,
"default_stops": ["<|eot_id|>"],
}
```

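Conceptually, applying such a template just wraps each message between the corresponding start/end markers. The helper below is a simplified illustration of that idea only, not LLMBox's actual implementation (which also handles smart spaces, few-shot examples, and stop tokens):

```python
# Illustration only: rendering a conversation with a template dict like the one above.
def render(template: dict, messages: list) -> str:
    prompt = ""
    for message in messages:
        role = message["role"]  # "system", "user", or "assistant"
        prompt += template[f"{role}_start"] + message["content"] + template[f"{role}_end"]
    # Leave an open assistant turn so the model generates the reply from here.
    return prompt + template["assistant_start"]
```

For a single user turn `{"role": "user", "content": "Hi"}`, this yields `<|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|>` followed by the opening assistant header.
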
When loading a chat model, we try to match it to a chat template by the model's name. For example, the `Meta-Llama3-8B-Instruct` model is matched with the `llama3` chat template.

### Specify a supported chat template

If the chat template is not correctly loaded, you can manually set the chat template by adding the `--chat_template` argument to the command line.

For example, InternLM-2 uses the `chatml` chat template. You can specify the chat template like this:

```bash
python inference.py -m internlm/internlm2-chat-7b -d gsm8k --chat_template chatml
```

### Use the chat template that comes with the tokenizer

In the examples above, we use our own chat templates because some evaluation setups (e.g., `ppl_no_option`) require more fine-grained control over how the prompt is assembled.

However, you can still use the chat template that comes with the tokenizer. For example, if you are using the `Meta-Llama3-8B-Instruct` model, you can use its tokenizer's chat template like this:

```bash
python inference.py -m Meta-Llama3-8B-Instruct -d gsm8k --chat_template tokenizer
```

Alternatively, you can use the `--chat_template` argument to specify the path to a jinja template file.

For example, if you have a custom chat template `custom_chat_template.jinja`, you can load it like this:

```bash
python inference.py -m Meta-Llama3-8B-Instruct -d gsm8k --chat_template path/to/custom_chat_template.jinja
```
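
Under the hood, pointing `--chat_template` to a jinja file most likely amounts to handing the template string to the tokenizer, which `transformers` supports directly. A sketch of that mechanism (the file path is a placeholder, and the model name requires access to the gated weights):

```python
# Sketch: overriding a tokenizer's chat template with a custom jinja file.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

with open("path/to/custom_chat_template.jinja") as f:
    tokenizer.chat_template = f.read()  # replaces the template bundled with the tokenizer

messages = [{"role": "user", "content": "Hello!"}]
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```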
6 changes: 3 additions & 3 deletions tests/dry_test/test_datasets.py
@@ -9,7 +9,7 @@

datasets = {
"agieval": [],
"alpaca_eval": "skip",
"alpaca_eval": ["--inference_only", "--openai_api_key", "fake-key"],
"anli": [],
"arc": [],
"bbh": [],
@@ -36,7 +36,7 @@
"mbpp": ["--pass_at_k", "1"],
"mmlu": [],
"mrpc": [],
"mt_bench": "skip",
"mt_bench": ["--inference_only", "--openai_api_key", "fake-key"],
"nq": [],
"openbookqa": [],
"penguins_in_a_table": [],
@@ -55,7 +55,7 @@
"triviaqa": [],
"truthfulqa_mc": [],
"tydiqa": [],
"vicuna_bench": "skip", # gpteval
"vicuna_bench": ["--inference_only", "--openai_api_key", "fake-key"],
"webq": [],
"wic": [],
"winogender": [],