[Automated] Merge release into main (sotopia-lab#199)
* bump the version, test release to PyPI

* Update README.md

* Update README.md

* Update README.md

* bump version to 0.0.9

* Update Sotopia presentation information in README.md

* bump version to 0.0.10

* bump version

* add merge release back to main action

* change checkout v4->v3

* fix merge-back-to-main and pin mypy to <1.11.0

* merge bug fix

* upgrade the default model for handling badly-formatted outputs to gpt-4o-mini, as gpt-3.5-turbo is deprecated (sotopia-lab#183)

* update pull request -> pull request target

* bump version

* Add `bad_output_process_model` and `use_fixed_model_version` options to all generation methods, to prevent future OpenAI API changes from breaking Sotopia. (sotopia-lab#196)

* Two major updates: 1) add a `bad_output_process_model` option to all `agenerate_xxx()` methods so users can decide which model to use for handling bad outputs; by default this is set to `gpt-4o-mini`. 2) add a `use_fixed_model_version` option to all generation methods, since some fixed model versions may no longer be available in the future; users should be able to bypass the fixed-model-version mapping instead of getting stuck on an error. The documentation (`generation.md`) has been updated for these two changes accordingly.

* [autofix.ci] apply automated fixes

---------

Co-authored-by: Chenghao Yang <yangalan1996@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>

* fix gpt-3.5

* replace gpt3.5 turbo for tests

* update gpt-3.5-turbo to gpt-4o-mini

* bug fix for return fixed model version function

---------

Co-authored-by: XuhuiZhou <zhouxuhui2018@gmail.com>
Co-authored-by: Chenghao (Alan) Yang <chenghao@uchicago.edu>
Co-authored-by: Chenghao Yang <yangalan1996@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
5 people authored Oct 1, 2024
1 parent 4bf087d commit 785669c
Showing 18 changed files with 118 additions and 46 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -74,8 +74,8 @@ asyncio.run(
run_async_server(
model_dict={
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
sampler=UniformSampler(),
)
4 changes: 2 additions & 2 deletions docs/pages/concepts/agents.md
@@ -11,7 +11,7 @@ class LLMAgent(BaseAgent[Observation, AgentAction]):
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
model_name: str = "gpt-3.5-turbo",
model_name: str = "gpt-4o-mini",
script_like: bool = False,
) -> None:
```
@@ -26,7 +26,7 @@ class ScriptWritingAgent(LLMAgent):
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
model_name: str = "gpt-3.5-turbo",
model_name: str = "gpt-4o-mini",
agent_names: list[str] = [],
background: ScriptBackground | None = None,
) -> None:
16 changes: 16 additions & 0 deletions docs/pages/concepts/generation.md
@@ -12,6 +12,8 @@ async def agenerate(
output_parser: BaseOutputParser[OutputType],
temperature: float = 0.7,
structured_output: bool = False,
bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
use_fixed_model_version: bool = True
) -> OutputType:
input_variables = re.findall(r"(?<!{){([^{}]+)}(?!})", template)
```
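As an aside, the template-variable extraction in the last line of the hunk above can be exercised on its own. The following is a minimal, self-contained sketch (the helper name is ours, not part of Sotopia's API):

```python
import re

def find_input_variables(template: str) -> list[str]:
    # Capture single-brace {name} placeholders; doubled braces ({{...}})
    # are treated as escaped literals and skipped by the lookarounds.
    return re.findall(r"(?<!{){([^{}]+)}(?!})", template)

# Single-brace placeholders are extracted; {{json}} is left alone.
print(find_input_variables("You are {agent}. Respond in {{json}} with goal {goal}."))
# → ['agent', 'goal']
```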
@@ -23,6 +25,12 @@ The `agenerate` function is versatile by taking the `output_parser` as an argument
* `gpt-4o-mini-2024-07-18` and later
* `gpt-4o-2024-08-06` and later

The `bad_output_process_model` is the model used to reformat bad (malformed) outputs. `DEFAULT_BAD_OUTPUT_PROCESS_MODEL` is set to `gpt-4o-mini`. (At the time the Sotopia paper was published, we used `gpt-3.5-turbo-0613`; however, that model has since been retired by OpenAI.)

The `use_fixed_model_version` flag determines whether to pin the model to a fixed version. If set to `True`, the model is pinned to the version that was used in the Sotopia paper; if set to `False`, the latest available version is used.

Warning: Since some fixed model versions may no longer be available through the OpenAI API, setting `use_fixed_model_version = True` can result in an error.

</Callout>
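The version mapping itself is internal to Sotopia's generation module; purely as an illustration of the pinning behavior described in the callout above (the dictionary entry below is an example, not the actual mapping):

```python
# Illustrative sketch of the use_fixed_model_version behavior.
# The entries here are examples only; the real mapping lives inside Sotopia.
FIXED_MODEL_VERSIONS = {
    "gpt-3.5-turbo": "gpt-3.5-turbo-0613",  # version used in the Sotopia paper
}

def resolve_model(model_name: str, use_fixed_model_version: bool = True) -> str:
    """Pin to the paper's model version, or pass the name through untouched."""
    if use_fixed_model_version:
        return FIXED_MODEL_VERSIONS.get(model_name, model_name)
    return model_name

print(resolve_model("gpt-3.5-turbo"))         # → gpt-3.5-turbo-0613
print(resolve_model("gpt-3.5-turbo", False))  # → gpt-3.5-turbo
```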

Here are a few examples of how to use the `agenerate` function:
@@ -37,6 +45,8 @@ async def agenerate_env_profile(
inspiration_prompt: str = "asking my boyfriend to stop being friends with his ex",
examples: str = "",
temperature: float = 0.7,
bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
use_fixed_model_version: bool = True
) -> tuple[EnvironmentProfile, str]:
"""
Using langchain to generate the background
@@ -56,6 +66,8 @@ async def agenerate_env_profile(
),
output_parser=PydanticOutputParser(pydantic_object=EnvironmentProfile),
temperature=temperature,
bad_output_process_model=bad_output_process_model,
use_fixed_model_version=use_fixed_model_version
)
```
### Other generation functions
@@ -66,6 +78,8 @@ Similarly, there are other utility functions that build upon the `agenerate` function:
async def agenerate_relationship_profile(
model_name: str,
agents_profiles: list[str],
bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
use_fixed_model_version: bool = True
) -> tuple[RelationshipProfile, str]
```

@@ -78,5 +92,7 @@ async def agenerate_script(
agent_name: str = "",
history: str = "",
single_step: bool = False,
bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
use_fixed_model_version: bool = True
) -> tuple[ScriptInteractionReturnType, str]
```
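Each of these wrappers simply threads the two new options down to the underlying `agenerate` call. A miniature sketch of that forwarding pattern (stub functions for illustration, not the real API):

```python
import asyncio

DEFAULT_BAD_OUTPUT_PROCESS_MODEL = "gpt-4o-mini"

async def agenerate_stub(model_name: str, *, bad_output_process_model: str,
                         use_fixed_model_version: bool) -> dict:
    # Stand-in for the real agenerate(); just records which options arrived.
    return {"model": model_name, "fixer": bad_output_process_model,
            "pinned": use_fixed_model_version}

async def agenerate_script_stub(
    model_name: str,
    bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
    use_fixed_model_version: bool = True,
) -> dict:
    # The wrapper adds no logic of its own for these two options;
    # it only forwards them, as agenerate_env_profile does above.
    return await agenerate_stub(
        model_name,
        bad_output_process_model=bad_output_process_model,
        use_fixed_model_version=use_fixed_model_version,
    )

print(asyncio.run(agenerate_script_stub("gpt-4")))
# → {'model': 'gpt-4', 'fixer': 'gpt-4o-mini', 'pinned': True}
```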
4 changes: 2 additions & 2 deletions docs/pages/index.mdx
@@ -206,8 +206,8 @@ asyncio.run(
run_async_server(
model_dict={
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
sampler=UniformSampler(),
)
4 changes: 2 additions & 2 deletions examples/benchmark_evaluator.py
@@ -15,8 +15,8 @@

target_model_patterns: list[list[str]] = [
["gpt-4", "gpt-4", "gpt-3.5-turbo"],
["gpt-4", "gpt-3.5-turbo", "gpt-4"],
["gpt-4", "gpt-3.5-turbo", "togethercomputer/llama-2-70b-chat"],
["gpt-4", "gpt-4o-mini", "gpt-4"],
["gpt-4", "gpt-4o-mini", "togethercomputer/llama-2-70b-chat"],
["gpt-4", "togethercomputer/llama-2-70b-chat", "gpt-3.5-turbo"],
]

4 changes: 2 additions & 2 deletions examples/experiment_eval.py
@@ -170,8 +170,8 @@ def run_async_server_in_batch(
batch_size: int = 1,
model_names: dict[str, LLM_Name] = {
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
tag: str | None = None,
verbose: bool = False,
4 changes: 2 additions & 2 deletions examples/fix_missing_episodes.py
@@ -252,8 +252,8 @@ def re_run_missing_episodes(
combo_with_models: dict[tuple[LLM_Name, LLM_Name], list[tuple[str, str, str]]],
model_names: dict[str, LLM_Name] = {
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
batch_size: int = 5,
verbose: bool = False,
4 changes: 2 additions & 2 deletions examples/fix_missing_episodes_with_tag.py
@@ -350,8 +350,8 @@ def re_run_missing_episodes(
env_agent_ids: List[Tuple[str, str, str]] = [],
model_names: dict[str, LLM_Name] = {
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
batch_size: int = 5,
rerun_tag: str = "missing_episodes",
2 changes: 1 addition & 1 deletion examples/generate_script.py
@@ -175,7 +175,7 @@ def full_freeform(
def run_async_server_in_batch_script(
*,
batch_size: int = 10,
model: LLM_Name = "gpt-3.5-turbo",
model: LLM_Name = "gpt-4o-mini",
tag: str | None = None,
push_to_db: bool = True,
json_in_script: bool = False,
4 changes: 2 additions & 2 deletions examples/minimalist_demo.py
@@ -28,8 +28,8 @@
run_async_server(
model_dict={
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
sampler=UniformSampler(),
)
10 changes: 5 additions & 5 deletions sotopia/agents/generate_agent_background.py
@@ -20,13 +20,13 @@ async def generate_background(
else:
initial_profile = str(basic_info)
profile = await agenerate_init_profile(
model_name="gpt-3.5-turbo", basic_info=basic_info
model_name="gpt-4o-mini", basic_info=basic_info
)
first_narrative = convert_narratives(
model_name="gpt-3.5-turbo", narrative="first", text=profile
model_name="gpt-4o-mini", narrative="first", text=profile
)
second_narrative = convert_narratives(
model_name="gpt-3.5-turbo", narrative="second", text=profile
model_name="gpt-4o-mini", narrative="second", text=profile
)
previous_messages = []
return (
@@ -67,8 +67,8 @@ def generate_background_conversation(
json.dump(background_dict, f, indent=4)

model_names: dict[str, str] = {
"env": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"env": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
"agent1": "gpt-4",
}

4 changes: 2 additions & 2 deletions sotopia/agents/llm_agent.py
@@ -27,7 +27,7 @@ def __init__(
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
model_name: str = "gpt-3.5-turbo",
model_name: str = "gpt-4o-mini",
script_like: bool = False,
) -> None:
super().__init__(
@@ -99,7 +99,7 @@ def __init__(
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
model_name: str = "gpt-3.5-turbo",
model_name: str = "gpt-4o-mini",
agent_names: list[str] = [],
background: ScriptBackground | None = None,
) -> None:
2 changes: 1 addition & 1 deletion sotopia/envs/parallel.py
@@ -130,7 +130,7 @@ def __init__(
["none", "speak", "non-verbal communication", "action", "leave"]
),
action_order: Literal["simultaneous", "round-robin", "random"] = "simultaneous",
model_name: str = "gpt-3.5-turbo",
model_name: str = "gpt-4o-mini",
evaluators: list[Evaluator] = [],
terminal_evaluators: list[Evaluator] = [],
uuid_str: str | None = None,
