Error handling for incompatible model (microsoft#58)
* Error handling for incompatible model
tanya-borisova authored Sep 20, 2024
1 parent df1ae48 commit 7ecf419
Showing 6 changed files with 179 additions and 63 deletions.
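In short: deployment lookups now return typed model objects from the model catalogue, and the completions, chat-completions, and embeddings endpoints reject a deployment backed by an incompatible model with a 400 OperationNotSupported error instead of generating a response. A minimal client-side sketch of the new behaviour, assuming a locally running simulator configured like the test setup later in this diff, where "deployment1" is backed by an embedding model (key and endpoint values are illustrative):

from openai import AzureOpenAI, BadRequestError

client = AzureOpenAI(
    api_key="123456789",
    api_version="2023-12-01-preview",
    azure_endpoint="http://localhost:8001",
    max_retries=0,
)

try:
    client.chat.completions.create(
        model="deployment1",  # backed by text-embedding-ada-002, not a chat model
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        max_tokens=50,
    )
except BadRequestError as e:
    # Before this commit the simulator generated a lorem-ipsum chat response
    # anyway; it now rejects the request with code "OperationNotSupported".
    print(e.status_code)  # 400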
10 changes: 5 additions & 5 deletions examples/generator_replace_chat_completion/generator_config.py
@@ -4,12 +4,12 @@
 import json
 
 from aoai_api_simulator.auth import validate_api_key_header
-from aoai_api_simulator.models import Config, RequestContext
 from aoai_api_simulator.generator.openai import (
     calculate_latency,
     create_lorem_chat_completion_response,
-    get_model_name_from_deployment_name,
+    get_chat_model_from_deployment_name,
 )
+from aoai_api_simulator.models import Config, RequestContext
 from fastapi import Response


@@ -49,8 +49,8 @@ async def custom_azure_openai_chat_completion(context: RequestContext) -> Respon
 
     request_body = await request.json()
     deployment_name = path_params["deployment"]
-    model_name = get_model_name_from_deployment_name(context, deployment_name)
-    if model_name is None:
+    model = get_chat_model_from_deployment_name(context, deployment_name)
+    if model is None:
         return Response(
             status_code=404,
             content=json.dumps({"error": f"Deployment {deployment_name} not found"}),
@@ -70,7 +70,7 @@ async def custom_azure_openai_chat_completion(context: RequestContext) -> Respon
     response = create_lorem_chat_completion_response(
         context=context,
         deployment_name=deployment_name,
-        model_name=model_name,
+        model_name=model.name,
         streaming=streaming,
         max_tokens=max_tokens,
         prompt_messages=messages,
1 change: 1 addition & 0 deletions src/aoai-api-simulator/src/aoai_api_simulator/generator/model_catalogue.py
@@ -2,6 +2,7 @@
 
 model_catalogue = {
     "gpt-3.5-turbo": OpenAIChatModel(name="gpt-3.5-turbo"),
+    "gpt-3.5-turbo-0613": OpenAIChatModel(name="gpt-3.5-turbo-0613"),
     "text-embedding-ada-001": OpenAIEmbeddingModel(name="text-embedding-ada-001", supports_custom_dimensions=False),
     "text-embedding-ada-002": OpenAIEmbeddingModel(name="text-embedding-ada-002", supports_custom_dimensions=False),
     "text-embedding-3-small": OpenAIEmbeddingModel(name="text-embedding-3-small", supports_custom_dimensions=True),
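The catalogue entry added above gives the undefined-deployment fallback (see openai.py below) a typed OpenAIChatModel to return. Because entries are typed (OpenAIChatModel vs OpenAIEmbeddingModel), the endpoint guards can tell them apart with isinstance. A small sketch of that distinction, assuming the imports used elsewhere in this diff:

from aoai_api_simulator.generator.model_catalogue import model_catalogue
from aoai_api_simulator.models import OpenAIChatModel

chat_model = model_catalogue["gpt-3.5-turbo-0613"]
embedding_model = model_catalogue["text-embedding-ada-002"]

print(isinstance(chat_model, OpenAIChatModel))       # True  -> chat endpoints accept it
print(isinstance(embedding_model, OpenAIChatModel))  # False -> chat endpoints return 400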
71 changes: 54 additions & 17 deletions src/aoai-api-simulator/src/aoai_api_simulator/generator/openai.py
@@ -24,7 +24,7 @@
     num_tokens_from_messages,
     num_tokens_from_string,
 )
-from aoai_api_simulator.models import OpenAIDeployment, OpenAIEmbeddingModel, RequestContext
+from aoai_api_simulator.models import OpenAIChatModel, OpenAIDeployment, OpenAIEmbeddingModel, RequestContext
 from fastapi import Response
 from fastapi.responses import StreamingResponse
 
@@ -44,7 +44,7 @@
 )
 
 
-def get_embedding_model_from_deployment_name(context: RequestContext, deployment_name: str) -> OpenAIDeployment | None:
+def get_embedding_deployment_from_name(context: RequestContext, deployment_name: str) -> OpenAIDeployment | None:
     """
     Gets the embedding model for the specified embedding deployment.
     If the deployment is not in the configured deployments,
@@ -91,7 +91,7 @@ def get_embedding_model_from_deployment_name(context: RequestContext, deployment
     return None
 
 
-def get_model_name_from_deployment_name(context: RequestContext, deployment_name: str) -> str | None:
+def get_chat_model_from_deployment_name(context: RequestContext, deployment_name: str) -> OpenAIChatModel | None:
     """
     Gets the model name for the specified deployment.
     If the deployment is not in the configured deployments then either a default model is returned (if )
@@ -100,7 +100,7 @@ def get_model_name_from_deployment_name(context: RequestContext, deployment_name
     if deployments:
         deployment = deployments.get(deployment_name)
         if deployment:
-            return deployment.model.name
+            return deployment.model
 
     if context.config.allow_undefined_openai_deployments:
         default_model = "gpt-3.5-turbo-0613"
@@ -114,7 +114,7 @@ def get_model_name_from_deployment_name(context: RequestContext, deployment_name
             default_model,
         )
         deployment_missing_warning_printed.add(deployment_name)
-        return default_model
+        return model_catalogue[default_model]
 
     # Output warning for missing deployment name (only the first time we encounter it)
     if deployment_name not in deployment_missing_warning_printed:
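Note the fallback change above: with allow_undefined_openai_deployments enabled, an unknown deployment now resolves to the catalogue entry for the default model rather than a bare string, so callers get an object with both a .name (for token counting) and a concrete type (for the isinstance guards). A quick sketch of what the fallback returns, assuming the catalogue from this diff:

from aoai_api_simulator.generator.model_catalogue import model_catalogue

default_model = "gpt-3.5-turbo-0613"
model = model_catalogue[default_model]
print(model.name)            # "gpt-3.5-turbo-0613" (used by num_tokens_from_*)
print(type(model).__name__)  # "OpenAIChatModel" (used by the endpoint guards)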
@@ -459,7 +459,7 @@ async def azure_openai_embedding(context: RequestContext) -> Response | None:
     _validate_api_key_header(context)
     deployment_name = path_params["deployment"]
     request_body = await request.json()
-    deployment = get_embedding_model_from_deployment_name(context, deployment_name)
+    deployment = get_embedding_deployment_from_name(context, deployment_name)
 
     if deployment is None:
         return Response(
@@ -478,7 +478,8 @@ async def azure_openai_embedding(context: RequestContext) -> Response | None:
                 "error": {
                     "code": "OperationNotSupported",
                     "message": f"The embeddings operation does not work with the specified model, {deployment_name}. "
-                    + "Please choose different model and try again.",
+                    + "Please choose different model and try again. "
+                    + "You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.",
                 }
             }
         ),
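For reference, the 400 body this guard produces, reconstructed as a Python literal from the json.dumps call above ("low_limit" stands in for the offending deployment name):

error_body = {
    "error": {
        "code": "OperationNotSupported",
        "message": "The embeddings operation does not work with the specified model, low_limit. "
        "Please choose different model and try again. "
        "You can learn more about which models can be used with each operation here: "
        "https://go.microsoft.com/fwlink/?linkid=2197993.",
    }
}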
@@ -514,27 +515,45 @@ async def azure_openai_completion(context: RequestContext) -> Response | None:
     _validate_api_key_header(context)
 
     deployment_name = path_params["deployment"]
-    model_name = get_model_name_from_deployment_name(context, deployment_name)
-    if model_name is None:
+    model = get_chat_model_from_deployment_name(context, deployment_name)
+    if model is None:
         return Response(
             status_code=404,
             content=json.dumps({"error": f"Deployment {deployment_name} not found"}),
             headers={
                 "Content-Type": "application/json",
             },
         )
+
+    if not isinstance(model, OpenAIChatModel):
+        return Response(
+            status_code=400,
+            content=json.dumps(
+                {
+                    "error": {
+                        "code": "OperationNotSupported",
+                        "message": f"The completions operation does not work with the specified model, {deployment_name}. "
+                        + "Please choose different model and try again. "
+                        + "You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.",
+                    }
+                }
+            ),
+            headers={
+                "Content-Type": "application/json",
+            },
+        )
     request_body = await request.json()
-    prompt_tokens = num_tokens_from_string(request_body["prompt"], model_name)
+    prompt_tokens = num_tokens_from_string(request_body["prompt"], model.name)
 
-    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model_name, prompt_tokens=prompt_tokens)
+    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model.name, prompt_tokens=prompt_tokens)
 
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_REQUESTED] = requested_max_tokens
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_EFFECTIVE] = max_tokens
 
     response = create_completion_response(
         context=context,
         deployment_name=deployment_name,
-        model_name=model_name,
+        model_name=model.name,
         prompt_tokens=prompt_tokens,
         max_tokens=max_tokens,
     )
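The same isinstance guard now protects the legacy completions endpoint. A hedged client-side sketch, again assuming the test configuration in which "deployment1" maps to text-embedding-ada-002 (key and endpoint are illustrative):

from openai import AzureOpenAI, BadRequestError

client = AzureOpenAI(
    api_key="123456789",
    api_version="2023-12-01-preview",
    azure_endpoint="http://localhost:8001",
    max_retries=0,
)

try:
    client.completions.create(model="deployment1", prompt="hello", max_tokens=10)
except BadRequestError as e:
    print(e.status_code)  # 400, code "OperationNotSupported"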
@@ -558,19 +577,37 @@ async def azure_openai_chat_completion(context: RequestContext) -> Response | No
 
     request_body = await request.json()
     deployment_name = path_params["deployment"]
-    model_name = get_model_name_from_deployment_name(context, deployment_name)
-    if model_name is None:
+    model = get_chat_model_from_deployment_name(context, deployment_name)
+    if model is None:
         return Response(
             status_code=404,
             content=json.dumps({"error": f"Deployment {deployment_name} not found"}),
             headers={
                 "Content-Type": "application/json",
             },
         )
+    if not isinstance(model, OpenAIChatModel):
+        return Response(
+            status_code=400,
+            content=json.dumps(
+                {
+                    "error": {
+                        "code": "OperationNotSupported",
+                        "message": f"The chatCompletion operation does not work with the specified model, {deployment_name}. "
+                        + "Please choose different model and try again. "
+                        + "You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.",
+                    }
+                }
+            ),
+            headers={
+                "Content-Type": "application/json",
+            },
+        )
 
     messages = request_body["messages"]
-    prompt_tokens = num_tokens_from_messages(messages, model_name)
+    prompt_tokens = num_tokens_from_messages(messages, model.name)
 
-    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model_name, prompt_tokens=prompt_tokens)
+    requested_max_tokens, max_tokens = get_max_completion_tokens(request_body, model.name, prompt_tokens=prompt_tokens)
 
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_REQUESTED] = requested_max_tokens
     context.values[SIMULATOR_KEY_OPENAI_MAX_TOKENS_EFFECTIVE] = max_tokens
@@ -580,7 +617,7 @@ async def azure_openai_chat_completion(context: RequestContext) -> Response | No
     response = create_lorem_chat_completion_response(
         context=context,
         deployment_name=deployment_name,
-        model_name=model_name,
+        model_name=model.name,
         streaming=streaming,
         max_tokens=max_tokens,
         prompt_messages=messages,
75 changes: 58 additions & 17 deletions tests/test_openai_generator_chat_completion.py
@@ -2,22 +2,24 @@
 Test the OpenAI generator endpoints
 """
 
+import pytest
+from aoai_api_simulator.generator.manager import get_default_generators
+from aoai_api_simulator.generator.model_catalogue import model_catalogue
 from aoai_api_simulator.models import (
-    Config,
-    LatencyConfig,
     ChatCompletionLatency,
     CompletionLatency,
+    Config,
     EmbeddingLatency,
+    LatencyConfig,
     OpenAIDeployment,
 )
-from aoai_api_simulator.generator.manager import get_default_generators
-from openai import AzureOpenAI, AuthenticationError, NotFoundError, RateLimitError, Stream
+from openai import AuthenticationError, AzureOpenAI, BadRequestError, NotFoundError, Stream
 from openai.types.chat import ChatCompletionChunk
-import pytest
 
 from .test_uvicorn_server import UvicornTestServer
 
 API_KEY = "123456789"
+ENDPOINT = "http://localhost:8001"
 
 
 def _get_generator_config(extension_path: str | None = None) -> Config:
@@ -39,7 +41,20 @@ def _get_generator_config(extension_path: str | None = None) -> Config:
         ),
     )
     config.openai_deployments = {
-        "low_limit": OpenAIDeployment(name="low_limit", model="gpt-3.5-turbo", tokens_per_minute=64 * 6)
+        "low_limit": OpenAIDeployment(
+            name="low_limit", model=model_catalogue["gpt-3.5-turbo"], tokens_per_minute=64 * 6
+        ),
+        "deployment1": OpenAIDeployment(
+            name="deployment1",
+            model=model_catalogue["text-embedding-ada-002"],
+            embedding_size=1536,
+            tokens_per_minute=10000,
+        ),
+        "gpt-3.5-10m": OpenAIDeployment(
+            name="gpt-3.5-10m",
+            model=model_catalogue["gpt-3.5-turbo"],
+            tokens_per_minute=10000000,
+        ),
     }
     config.extension_path = extension_path
     return config
@@ -56,7 +71,7 @@ async def test_requires_auth():
     aoai_client = AzureOpenAI(
         api_key="wrong_key",
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
@@ -79,12 +94,12 @@ async def test_success():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
     max_tokens = 50
-    response = aoai_client.chat.completions.create(model="deployment1", messages=messages, max_tokens=max_tokens)
+    response = aoai_client.chat.completions.create(model="low_limit", messages=messages, max_tokens=max_tokens)
 
     assert len(response.choices) == 1
     assert response.choices[0].message.role == "assistant"
@@ -105,7 +120,7 @@ async def test_requires_known_deployment_when_config_set():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
@@ -130,7 +145,7 @@ async def test_allows_unknown_deployment_when_config_not_set():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
@@ -154,7 +169,7 @@ async def test_max_tokens():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
 
     # Make repeated requests to ensure that none exceed max_tokens
     for _ in range(1000):
         response = aoai_client.chat.completions.create(
-            model="deployment1", messages=messages, max_tokens=max_tokens
+            model="gpt-3.5-10m", messages=messages, max_tokens=max_tokens
         )
         assert response.usage.completion_tokens <= max_tokens
@@ -179,12 +194,12 @@ async def test_stream_success():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
     response: Stream[ChatCompletionChunk] = aoai_client.chat.completions.create(
-        model="deployment1", messages=messages, max_tokens=50, stream=True
+        model="low_limit", messages=messages, max_tokens=50, stream=True
     )
 
     is_first_chunk = True
@@ -213,13 +228,39 @@ async def test_custom_generator():
     aoai_client = AzureOpenAI(
         api_key=API_KEY,
         api_version="2023-12-01-preview",
-        azure_endpoint="http://localhost:8001",
+        azure_endpoint=ENDPOINT,
         max_retries=0,
     )
     messages = [{"role": "user", "content": "What is the meaning of life?"}]
-    response = aoai_client.chat.completions.create(model="deployment1", messages=messages, max_tokens=50)
+    response = aoai_client.chat.completions.create(model="low_limit", messages=messages, max_tokens=50)
 
     assert len(response.choices) == 1
     assert response.choices[0].message.role == "assistant"
     assert response.usage.completion_tokens <= 10, "Custom generator hard-codes max_tokens to 10"
     assert response.choices[0].finish_reason == "stop"
+
+
+@pytest.mark.asyncio
+async def test_using_unsupported_model_for_completions_returns_400():
+    """
+    Test that passing in an unsupported model name to chat completion generation
+    fails with 400 Bad Request
+    """
+    config = _get_generator_config()
+    server = UvicornTestServer(config)
+    with server.run_in_thread():
+        aoai_client = AzureOpenAI(
+            api_key=API_KEY,
+            api_version="2023-12-01-preview",
+            azure_endpoint=ENDPOINT,
+            max_retries=0,
+        )
+        with pytest.raises(BadRequestError) as e:
+            messages = [{"role": "user", "content": "What is the meaning of life?"}]
+            aoai_client.chat.completions.create(model="deployment1", messages=messages, max_tokens=50)
+
+    assert e.value.status_code == 400
+    assert (
+        e.value.message
+        == "Error code: 400 - {'error': {'code': 'OperationNotSupported', 'message': 'The chatCompletion operation does not work with the specified model, deployment1. Please choose different model and try again. You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.'}}"
+    )
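The new test exercises the chat-completions guard. The embeddings guard in openai.py (see the hunk above) could be exercised the same way; the following companion test is a hypothetical sketch, not part of this commit, that reuses this module's imports and helpers:

@pytest.mark.asyncio
async def test_using_chat_model_for_embeddings_returns_400():
    # Hypothetical mirror of the test above: "low_limit" maps to gpt-3.5-turbo,
    # so the embeddings endpoint should reject it with OperationNotSupported.
    config = _get_generator_config()
    server = UvicornTestServer(config)
    with server.run_in_thread():
        aoai_client = AzureOpenAI(
            api_key=API_KEY,
            api_version="2023-12-01-preview",
            azure_endpoint=ENDPOINT,
            max_retries=0,
        )
        with pytest.raises(BadRequestError) as e:
            aoai_client.embeddings.create(model="low_limit", input="hello")

    assert e.value.status_code == 400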