Error handling for incompatible model (microsoft#58)
* Error handling for incompatible model
tanya-borisova authored Sep 20, 2024
1 parent df1ae48 commit 7ecf419
Showing 6 changed files with 179 additions and 63 deletions.
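In short: deployment lookups now return typed model objects from the model catalogue, and the completions, chat-completions, and embeddings endpoints reject a deployment backed by an incompatible model with a 400 OperationNotSupported error instead of generating a response. A minimal client-side sketch of the new behaviour, assuming a locally running simulator configured like the test setup later in this diff, where "deployment1" is backed by an embedding model (key and endpoint values are illustrative):

from openai import AzureOpenAI, BadRequestError

client = AzureOpenAI(
    api_key="123456789",
    api_version="2023-12-01-preview",
    azure_endpoint="http://localhost:8001",
    max_retries=0,
)

try:
    client.chat.completions.create(
        model="deployment1",  # backed by text-embedding-ada-002, not a chat model
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        max_tokens=50,
    )
except BadRequestError as e:
    # Before this commit the simulator generated a lorem-ipsum chat response
    # anyway; it now rejects the request with code "OperationNotSupported".
    print(e.status_code)  # 400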
10 changes: 5 additions & 5 deletions examples/generator_replace_chat_completion/generator_config.py
@@ -4,12 +4,12 @@
 import json
 
 from aoai_api_simulator.auth import validate_api_key_header
-from aoai_api_simulator.models import Config, RequestContext
 from aoai_api_simulator.generator.openai import (
     calculate_latency,
     create_lorem_chat_completion_response,
-    get_model_name_from_deployment_name,
+    get_chat_model_from_deployment_name,
 )
+from aoai_api_simulator.models import Config, RequestContext
 from fastapi import Response


@@ -49,8 +49,8 @@ async def custom_azure_openai_chat_completion(context: RequestContext) -> Respon
 
     request_body = await request.json()
     deployment_name = path_params["deployment"]
-    model_name = get_model_name_from_deployment_name(context, deployment_name)
-    if model_name is None:
+    model = get_chat_model_from_deployment_name(context, deployment_name)
+    if model is None:
         return Response(
             status_code=404,
             content=json.dumps({"error": f"Deployment {deployment_name} not found"}),
@@ -70,7 +70,7 @@ async def custom_azure_openai_chat_completion(context: RequestContext) -> Respon
     response = create_lorem_chat_completion_response(
         context=context,
         deployment_name=deployment_name,
-        model_name=model_name,
+        model_name=model.name,
         streaming=streaming,
         max_tokens=max_tokens,
         prompt_messages=messages,
1 change: 1 addition & 0 deletions src/aoai-api-simulator/src/aoai_api_simulator/generator/model_catalogue.py
@@ -2,6 +2,7 @@
 
 model_catalogue = {
     "gpt-3.5-turbo": OpenAIChatModel(name="gpt-3.5-turbo"),
+    "gpt-3.5-turbo-0613": OpenAIChatModel(name="gpt-3.5-turbo-0613"),
     "text-embedding-ada-001": OpenAIEmbeddingModel(name="text-embedding-ada-001", supports_custom_dimensions=False),
     "text-embedding-ada-002": OpenAIEmbeddingModel(name="text-embedding-ada-002", supports_custom_dimensions=False),
     "text-embedding-3-small": OpenAIEmbeddingModel(name="text-embedding-3-small", supports_custom_dimensions=True),
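The catalogue entry added above gives the undefined-deployment fallback (see openai.py below) a typed OpenAIChatModel to return. Because entries are typed (OpenAIChatModel vs OpenAIEmbeddingModel), the endpoint guards can tell them apart with isinstance. A small sketch of that distinction, assuming the imports used elsewhere in this diff:

from aoai_api_simulator.generator.model_catalogue import model_catalogue
from aoai_api_simulator.models import OpenAIChatModel

chat_model = model_catalogue["gpt-3.5-turbo-0613"]
embedding_model = model_catalogue["text-embedding-ada-002"]

print(isinstance(chat_model, OpenAIChatModel))       # True  -> chat endpoints accept it
print(isinstance(embedding_model, OpenAIChatModel))  # False -> chat endpoints return 400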
71 changes: 54 additions & 17 deletions src/aoai-api-simulator/src/aoai_api_simulator/generator/openai.py
@@ -24,7 +24,7 @@
     num_tokens_from_messages,
     num_tokens_from_string,
 )
-from aoai_api_simulator.models import OpenAIDeployment, OpenAIEmbeddingModel, RequestContext
+from aoai_api_simulator.models import OpenAIChatModel, OpenAIDeployment, OpenAIEmbeddingModel, RequestContext
 from fastapi import Response
 from fastapi.responses import StreamingResponse
 
@@ -44,7 +44,7 @@
 )
 
 
-def get_embedding_model_from_deployment_name(context: RequestContext, deployment_name: str) -> OpenAIDeployment | None:
+def get_embedding_deployment_from_name(context: RequestContext, deployment_name: str) -> OpenAIDeployment | None:
     """
     Gets the embedding model for the specified embedding deployment.
     If the deployment is not in the configured deployments,
@@ -91,7 +91,7 @@ def get_embedding_model_from_deployment_name(context: RequestContext, deployment
     return None
 
 
-def get_model_name_from_deployment_name(context: RequestContext, deployment_name: str) -> str | None:
+def get_chat_model_from_deployment_name(context: RequestContext, deployment_name: str) -> OpenAIChatModel | None:
     """
     Gets the model name for the specified deployment.
     If the deployment is not in the configured deployments then either a default model is returned (if )
@@ -100,7 +100,7 @@ def get_model_name_from_deployment_name(context: RequestContext, deployment_name
     if deployments:
         deployment = deployments.get(deployment_name)
         if deployment:
-            return deployment.model.name
+            return deployment.model
 
     if context.config.allow_undefined_openai_deployments:
         default_model = "gpt-3.5-turbo-0613"
@@ -114,7 +114,7 @@ def get_model_name_from_deployment_name(context: RequestContext, deployment_name
             default_model,
         )
         deployment_missing_warning_printed.add(deployment_name)
-        return default_model
+        return model_catalogue[default_model]
 
     # Output warning for missing deployment name (only the first time we encounter it)
     if deployment_name not in deployment_missing_warning_printed:
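Note the fallback change above: with allow_undefined_openai_deployments enabled, an unknown deployment now resolves to the catalogue entry for the default model rather than a bare string, so callers get an object with both a .name (for token counting) and a concrete type (for the isinstance guards). A quick sketch of what the fallback returns, assuming the catalogue from this diff:

from aoai_api_simulator.generator.model_catalogue import model_catalogue

default_model = "gpt-3.5-turbo-0613"
model = model_catalogue[default_model]
print(model.name)            # "gpt-3.5-turbo-0613" (used by num_tokens_from_*)
print(type(model).__name__)  # "OpenAIChatModel" (used by the endpoint guards)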
@@ -459,7 +459,7 @@ async def azure_openai_embedding(context: RequestContext) -> Response | None:
     _validate_api_key_header(context)
     deployment_name = path_params["deployment"]
     request_body = await request.json()
-    deployment = get_embedding_model_from_deployment_name(context, deployment_name)
+    deployment = get_embedding_deployment_from_name(context, deployment_name)
 
     if deployment is None:
         return Response(
@@ -478,7 +478,8 @@ async def azure_openai_embedding(context: RequestContext) -> Response | None:
                 "error": {
                     "code": "OperationNotSupported",
                     "message": f"The embeddings operation does not work with the specified model, {deployment_name}. "
-                    + "Please choose different model and try again.",
+                    + "Please choose different model and try again. "
+                    + "You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.",
                 }
             }
         ),
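For reference, the 400 body this guard produces, reconstructed as a Python literal from the json.dumps call above ("low_limit" stands in for the offending deployment name):

error_body = {
    "error": {
        "code": "OperationNotSupported",
        "message": "The embeddings operation does not work with the specified model, low_limit. "
        "Please choose different model and try again. "
        "You can learn more about which models can be used with each operation here: "
        "https://go.microsoft.com/fwlink/?linkid=2197993.",
    }
}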
@@ -514,27 +515,45 @@ async def azure_openai_completion(context: RequestContext) -> Response | None:
     _validate_api_key_header(context)
 
     deployment_name = path_params["deployment"]
-    model_name = get_model_name_from_deployment_name(context, deployment_name)
-    if model_name is None:
+    model = get_chat_model_from_deployment_name(context, deployment_name)
+    if model is None:
         return Response(
             status_code=404,
             content=json.dumps({"error": f"Deployment {deployment_name} not found"}),
             headers={
                 "Content-Type": "application/json",
             },
         )
+
+    if not isinstance(model, OpenAIChatModel):
+        return Response(
+            status_code=400,
+            content=json.dumps(
+                {
+                    "error": {
+                        "code": "OperationNotSupported",
+                        "message": f"The completions operation does not work with the specified model, {deployment_name}. "
+                        + "Please choose different model and try again. "
+                        + "You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.",
+                    }
+                }
+            ),
+            headers={
+                "Content-Type": "application/json",
+            },
+        )
     request_body = await request.json()
-    prompt_tokens = num_tokens_from_string(request_body["prompt"], model_name)
+    prompt_tokens = num_tokens_from_string(request_body["prompt"], model.name)
 
-    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model_name, prompt_tokens=prompt_tokens)
+    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model.name, prompt_tokens=prompt_tokens)
 
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_REQUESTED] = requested_max_tokens
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_EFFECTIVE] = max_tokens
 
     response = create_completion_response(
         context=context,
         deployment_name=deployment_name,
-        model_name=model_name,
+        model_name=model.name,
         prompt_tokens=prompt_tokens,
         max_tokens=max_tokens,
     )
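The same isinstance guard now protects the legacy completions endpoint. A hedged client-side sketch, again assuming the test configuration in which "deployment1" maps to text-embedding-ada-002 (key and endpoint are illustrative):

from openai import AzureOpenAI, BadRequestError

client = AzureOpenAI(
    api_key="123456789",
    api_version="2023-12-01-preview",
    azure_endpoint="http://localhost:8001",
    max_retries=0,
)

try:
    client.completions.create(model="deployment1", prompt="hello", max_tokens=10)
except BadRequestError as e:
    print(e.status_code)  # 400, code "OperationNotSupported"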
@@ -558,19 +577,37 @@ async def azure_openai_chat_completion(context: RequestContext) -> Response | No
 
     request_body = await request.json()
     deployment_name = path_params["deployment"]
-    model_name = get_model_name_from_deployment_name(context, deployment_name)
-    if model_name is None:
+    model = get_chat_model_from_deployment_name(context, deployment_name)
+    if model is None:
         return Response(
             status_code=404,
             content=json.dumps({"error": f"Deployment {deployment_name} not found"}),
             headers={
                 "Content-Type": "application/json",
             },
         )
+    if not isinstance(model, OpenAIChatModel):
+        return Response(
+            status_code=400,
+            content=json.dumps(
+                {
+                    "error": {
+                        "code": "OperationNotSupported",
+                        "message": f"The chatCompletion operation does not work with the specified model, {deployment_name}. "
+                        + "Please choose different model and try again. "
+                        + "You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.",
+                    }
+                }
+            ),
+            headers={
+                "Content-Type": "application/json",
+            },
+        )
 
     messages = request_body["messages"]
-    prompt_tokens = num_tokens_from_messages(messages, model_name)
+    prompt_tokens = num_tokens_from_messages(messages, model.name)
 
-    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model_name, prompt_tokens=prompt_tokens)
+    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model.name, prompt_tokens=prompt_tokens)
 
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_REQUESTED] = requested_max_tokens
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_EFFECTIVE] = max_tokens
@@ -580,7 +617,7 @@ async def azure_openai_chat_completion(context: RequestContext) -> Response | No
     response = create_lorem_chat_completion_response(
         context=context,
         deployment_name=deployment_name,
-        model_name=model_name,
+        model_name=model.name,
         streaming=streaming,
         max_tokens=max_tokens,
         prompt_messages=messages,
75 changes: 58 additions & 17 deletions tests/test_openai_generator_chat_completion.py
@@ -2,22 +2,24 @@
 Test the OpenAI generator endpoints
 """
 
+import pytest
+from aoai_api_simulator.generator.manager import get_default_generators
+from aoai_api_simulator.generator.model_catalogue import model_catalogue
 from aoai_api_simulator.models import (
-    Config,
-    LatencyConfig,
     ChatCompletionLatency,
     CompletionLatency,
+    Config,
     EmbeddingLatency,
+    LatencyConfig,
     OpenAIDeployment,
 )
-from aoai_api_simulator.generator.manager import get_default_generators
-from openai import AzureOpenAI, AuthenticationError, NotFoundError, RateLimitError, Stream
+from openai import AuthenticationError, AzureOpenAI, BadRequestError, NotFoundError, Stream
 from openai.types.chat import ChatCompletionChunk
-import pytest
 
 from .test_uvicorn_server import UvicornTestServer
 
 API_KEY = "123456789"
+ENDPOINT = "http://localhost:8001"
 
 
 def _get_generator_config(extension_path: str | None = None) -> Config:
@@ -39,7 +41,20 @@ def _get_generator_config(extension_path: str | None = None) -> Config:
         ),
     )
     config.openai_deployments = {
-        "low_limit": OpenAIDeployment(name="low_limit", model="gpt-3.5-turbo", tokens_per_minute=64 * 6)
+        "low_limit": OpenAIDeployment(
+            name="low_limit", model=model_catalogue["gpt-3.5-turbo"], tokens_per_minute=64 * 6
+        ),
+        "deployment1": OpenAIDeployment(
+            name="deployment1",
+            model=model_catalogue["text-embedding-ada-002"],
+            embedding_size=1536,
+            tokens_per_minute=10000,
+        ),
+        "gpt-3.5-10m": OpenAIDeployment(
+            name="gpt-3.5-10m",
+            model=model_catalogue["gpt-3.5-turbo"],
+            tokens_per_minute=10000000,
+        ),
     }
     config.extension_path = extension_path
     return config
@@ -56,7 +71,7 @@ async def test_requires_auth():
     aoai_client = AzureOpenAI(
         api_key="wrong_key",
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
@@ -79,12 +94,12 @@ async def test_success():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
     max_tokens = 50
-    response = aoai_client.chat.completions.create(model="deployment1", messages=messages, max_tokens=max_tokens)
+    response = aoai_client.chat.completions.create(model="low_limit", messages=messages, max_tokens=max_tokens)
 
     assert len(response.choices) == 1
     assert response.choices[0].message.role == "assistant"
@@ -105,7 +120,7 @@ async def test_requires_known_deployment_when_config_set():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
@@ -130,7 +145,7 @@ async def test_allows_unknown_deployment_when_config_not_set():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
@@ -154,7 +169,7 @@ async def test_max_tokens():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
 
     # Make repeated requests to ensure that none exceed max_tokens
     for _ in range(1000):
         response = aoai_client.chat.completions.create(
-            model="deployment1", messages=messages, max_tokens=max_tokens
+            model="gpt-3.5-10m", messages=messages, max_tokens=max_tokens
         )
         assert response.usage.completion_tokens <= max_tokens
@@ -179,12 +194,12 @@ async def test_stream_success():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
     response: Stream[ChatCompletionChunk] = aoai_client.chat.completions.create(
-        model="deployment1", messages=messages, max_tokens=50, stream=True
+        model="low_limit", messages=messages, max_tokens=50, stream=True
     )
 
     is_first_chunk = True
@@ -213,13 +228,39 @@ async def test_custom_generator():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
-    response = aoai_client.chat.completions.create(model="deployment1", messages=messages, max_tokens=50)
+    response = aoai_client.chat.completions.create(model="low_limit", messages=messages, max_tokens=50)
 
     assert len(response.choices) == 1
     assert response.choices[0].message.role == "assistant"
     assert response.usage.completion_tokens <= 10, "Custom generator hard-codes max_tokens to 10"
     assert response.choices[0].finish_reason == "stop"
+
+
+@pytest.mark.asyncio
+async def test_using_unsupported_model_for_completions_returns_400():
+    """
+    Test that passing in an unsupported model name to chat completion generation
+    fails with 400 Bad Request
+    """
+    config = _get_generator_config()
+    server = UvicornTestServer(config)
+    with server.run_in_thread():
+        aoai_client = AzureOpenAI(
+            api_key=API_KEY,
+            api_version="2023-12-01-preview",
+            azure_endpoint=ENDPOINT,
+            max_retries=0,
+        )
+        with pytest.raises(BadRequestError) as e:
+            messages = [{"role": "user", "content": "What is the meaning of life?"}]
+            aoai_client.chat.completions.create(model="deployment1", messages=messages, max_tokens=50)
+
+    assert e.value.status_code == 400
+    assert (
+        e.value.message
+        == "Error code: 400 - {'error': {'code': 'OperationNotSupported', 'message': 'The chatCompletion operation does not work with the specified model, deployment1. Please choose different model and try again. You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.'}}"
+    )
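The new test exercises the chat-completions guard. The embeddings guard in openai.py (see the hunk above) could be exercised the same way; the following companion test is a hypothetical sketch, not part of this commit, that reuses this module's imports and helpers:

@pytest.mark.asyncio
async def test_using_chat_model_for_embeddings_returns_400():
    # Hypothetical mirror of the test above: "low_limit" maps to gpt-3.5-turbo,
    # so the embeddings endpoint should reject it with OperationNotSupported.
    config = _get_generator_config()
    server = UvicornTestServer(config)
    with server.run_in_thread():
        aoai_client = AzureOpenAI(
            api_key=API_KEY,
            api_version="2023-12-01-preview",
            azure_endpoint=ENDPOINT,
            max_retries=0,
        )
        with pytest.raises(BadRequestError) as e:
            aoai_client.embeddings.create(model="low_limit", input="hello")

    assert e.value.status_code == 400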