Improvements on benchmark display and usage (sotopia-lab#135)
* add stdev for stats, and also provide a benchmark script

* fix mypy issue

* add 95% CI

* use numpy instead of scipy in CI (see the numpy sketch below the commit metadata)

* clean up code & add doc for benchmark

* [autofix.ci] apply automated fixes

* code cleanup

* minor bug fix and code cleanup

* refactor benchmark code

* add more models

* fix mypy

* remove binary option

* remove extra binary

* change the stop criteria to be more accurate

* add test cases for benchmark, also fix small t value bug

* [autofix.ci] apply automated fixes

* fix issue of episode not found

* fix mypy error

* [autofix.ci] apply automated fixes

* add test cases for benchmark, and minor code changes

* remove unnecessary print code

* add coverage by actually running one time and adding more mock data

* fix autofix error

* fix patching issues

* add unit tests for more arguments

* fix test_get_agent_by_name issue

* make the test stricter

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: XuhuiZhou <zhouxuhui2018@gmail.com>
3 people authored Aug 26, 2024
1 parent 77a97f8 commit 850d441
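
Several of the commit messages above refer to reporting a standard deviation and a 95% confidence interval for each benchmark score, computed with numpy rather than scipy. Below is a minimal sketch of that kind of summary, not the repository's implementation: it assumes a plain list of per-episode scores and uses the normal-approximation critical value 1.96, whereas the benchmark itself appears to use a t-distribution critical value (per the "fix small t value bug" entry). Function and variable names are illustrative only.

```
import numpy as np

def summarize_scores(scores: list[float]) -> tuple[float, float, float]:
    """Return (mean, sample stdev, 95% CI half-width) for per-episode scores.

    Illustrative only: uses the z critical value 1.96; the repository's
    benchmark may use a t-distribution critical value instead.
    """
    arr = np.asarray(scores, dtype=float)
    mean = float(arr.mean())
    stdev = float(arr.std(ddof=1))  # ddof=1 gives the sample standard deviation
    half_width = 1.96 * stdev / np.sqrt(len(arr))
    return mean, stdev, float(half_width)

mean, stdev, ci = summarize_scores([3.2, 4.0, 3.7, 3.9, 4.1])
print(f"{mean:.2f} ± {ci:.2f} (stdev {stdev:.2f})")
```
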
Showing 8 changed files with 737 additions and 254 deletions.
7 changes: 5 additions & 2 deletions docs/pages/examples/benchmark.md
@@ -1,11 +1,14 @@
# Benchmark your model as a social agent in Sotopia

```
sotopia_benchmark --model=<your_model_name>
sotopia benchmark --models <model1> --models <model2> [--only-show-performance]
```
or

```
python sotopia/benchmark/cli.py --model=<your_model_name>
python sotopia/cli/benchmark/benchmark.py --models <model1> --models <model2> [--only-show-performance]
```
When `only-show-performance` is specified, only results for models with available episodes are displayed; otherwise the benchmark is run.
Currently this script runs over 100 simulations on the Sotopia Hard tasks, and the partner model is fixed to `meta-llama/Llama-3-70b-chat-hf`.

An example script is provided in `scripts/display_benchmark_results.sh`
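
For scripting several models at once, the same CLI invocation documented above can also be driven from Python. The snippet below is a hedged sketch that relies only on the flags shown in this page; the model names are placeholders, not a recommended set.

```
import subprocess

# Example model list; substitute the models you want to display or benchmark.
models = ["gpt-4o", "together_ai/meta-llama/Llama-3-70b-chat-hf"]

cmd = ["sotopia", "benchmark", "--only-show-performance"]
for model in models:
    cmd += ["--models", model]

# Equivalent to the shell command shown above; drop --only-show-performance
# to actually run the benchmark instead of only displaying existing results.
subprocess.run(cmd, check=True)
```
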
12 changes: 9 additions & 3 deletions examples/benchmark_evaluator.py
@@ -58,7 +58,9 @@ def get_dimension_correlation(
int(not isinstance(relevant_episode.rewards[0], float))
for relevant_episode in machine_annotations
]
assert sum(episodes_with_valid_rewards) == len(human_annotations), "Data is missing"
assert (
sum(episodes_with_valid_rewards) == len(human_annotations)
), f"Data is missing, episodes with valid rewards: {sum(episodes_with_valid_rewards)}, human annotations: {len(human_annotations)}"
overall = dimension == "overall"
dimension_scores_agent1_human = get_dimension_scores(
0, human_annotations, dimension, overall
@@ -116,6 +118,9 @@ def evaluate_evaluator(
EpisodeLog.tag == tag
).all() # type: ignore
if len(re_evaluated_episodes) < len(to_re_evaluate_list):
print(
f"Existing data: {len(re_evaluated_episodes)}, expected: {len(to_re_evaluate_list)}, running evaluation"
)
run_async_server_in_batch_aevaluate(
tag=tag,
model=model, # type: ignore
@@ -143,6 +148,8 @@ def evaluate_evaluator(
verbose=verbose,
reeval_list=to_re_evaluate_list,
)
for pk in to_re_evaluate_list:
EpisodeLog.delete(pk)
to_re_evaluate_list = []
re_evaluated_episodes = EpisodeLog.find(EpisodeLog.tag == tag).all() # type: ignore
valid_episodes = [
@@ -151,9 +158,8 @@
]
for valid, episode in zip(valid_episodes, re_evaluated_episodes):
if not valid:
pk = episode.pk
pk = episode.pk # type: ignore
assert isinstance(pk, str)
EpisodeLog.delete(pk)
to_re_evaluate_list.append(pk)

correlation_list = []
2 changes: 1 addition & 1 deletion examples/generate_scenarios.py
@@ -53,7 +53,7 @@ def check_existing_envs(

def generate_newenv_profile(
num: int,
gen_model: LLM_Name = "gpt-4-turbo",
gen_model: LLM_Name = "gpt-4-turbo-2024-04-09",
temperature: float = 0.5,
type: str = "craigslist_bargains",
) -> pd.DataFrame:
6 changes: 6 additions & 0 deletions scripts/display_benchmark_results.sh
@@ -0,0 +1,6 @@
sotopia benchmark --only-show-performance \
--models gpt-4o \
--models together_ai/mistralai/Mixtral-8x22B-Instruct-v0.1 \
--models gpt-3.5-turbo \
--models together_ai/meta-llama/Llama-3-70b-chat-hf \
--models together_ai/meta-llama/Llama-3-8b-chat-hf