diff --git a/docs/developing.md b/docs/developing.md
index 4535ab7..23571c2 100644
--- a/docs/developing.md
+++ b/docs/developing.md
@@ -65,3 +65,14 @@ This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 10 for e
 By keeping the `max_tokens` value low, we should trigger the request-based rate limiting rather than the token-based limiting.
 
+#### Load test: Token limiting (no added latency, 100,000 tokens per minute)
+
+To run this test, run `./scripts/run-load-test-limits-tokens.sh`.
+
+The simulator endpoint used in this test is configured for 100,000 tokens per minute.
+This equates to ~16,667 tokens per 10-second window.
+
+This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 200 for each request.
+By keeping the `max_tokens` value high, we should trigger the token-based rate limiting rather than the request-based limiting.
+
+> NOTE: Every 1000 tokens per minute allows 6 requests per minute. Provided the `max_tokens` value used is greater than 1000/6 ≈ 167, the rate-limiting should be triggered by tokens rather than requests.
diff --git a/scripts/run-load-test-limits-tokens.sh b/scripts/run-load-test-limits-tokens.sh
new file mode 100755
index 0000000..f86de31
--- /dev/null
+++ b/scripts/run-load-test-limits-tokens.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -e
+
+#
+# Runs a load test with no added latency against a deployment with a token limit.
+# Used to validate the token-based rate-limiting of the simulator under load.
+#
+# The script runs a load test in Container Apps
+# and then runs follow-up steps to validate the results.
+#
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Use deployment with 100k TPM limit
+deployment_name="gpt-35-turbo-100k-token"
+
+# Set max_tokens high to trigger rate-limiting by tokens not request count.
+# Since there are 6 RPM per 1000 TPM, a max_tokens value above 1000/6 (~167) hits the token limit first.
+# Keeping the value relatively low makes the validation more granular.
+max_tokens=200
+
+result=$(\
+  LOCUST_USERS=30 \
+  LOCUST_RUN_TIME=3m \
+  LOCUST_SPAWN_RATE=2 \
+  TEST_FILE=./test_chat_completions_no_added_latency.py \
+  DEPLOYMENT_NAME=$deployment_name \
+  MAX_TOKENS=$max_tokens \
+  ALLOW_429_RESPONSES=true \
+  ./scripts/_run-load-test-aca.sh)
+
+echo -e "________________\n$result"
+
+
+test_start_time=$(echo "$result" | jq -r '.start_time')
+test_stop_time=$(echo "$result" | jq -r '.end_time')
+
+echo "--test-start-time: '$test_start_time'"
+echo "--test-stop-time: '$test_stop_time'"
+echo ""
+echo "Running post steps"
+
+"$script_dir/_run-load-test-post-steps.sh" \
+  --test-start-time "$test_start_time" \
+  --test-stop-time "$test_stop_time" \
+  --filename ./src/loadtest/post_steps_limits_tokens.py
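A small worked example of the arithmetic behind the NOTE above and the `max_tokens` comment in the script (values taken from the docs and script; illustrative only, not part of the change):

```python
# Worked example of the token-vs-request limiting arithmetic (illustrative only).
TPM = 100_000                  # tokens-per-minute limit on the test deployment
RPM = 6 * (TPM // 1000)        # 6 requests per minute are allowed per 1000 TPM -> 600 RPM

tokens_per_10s = TPM / 6       # ~16,667 tokens per 10-second window
requests_per_10s = RPM / 6     # ~100 requests per 10-second window
token_threshold = TPM / RPM    # 1000/6 ~= 167 tokens per request before the token limit dominates

max_tokens = 200               # value used by this load test
limiting_factor = "tokens" if max_tokens > token_threshold else "requests"
print(round(tokens_per_10s), round(token_threshold), limiting_factor)  # 16667 167 tokens
```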
diff --git a/src/aoai-simulated-api/src/aoai_simulated_api/latency.py b/src/aoai-simulated-api/src/aoai_simulated_api/latency.py
index 26e3d1a..0034529 100644
--- a/src/aoai-simulated-api/src/aoai_simulated_api/latency.py
+++ b/src/aoai-simulated-api/src/aoai_simulated_api/latency.py
@@ -1,48 +1,12 @@
 import asyncio
-from dataclasses import dataclass
 import time
 
 from fastapi import Response
-from opentelemetry import trace, metrics
 
 from aoai_simulated_api import constants
+from aoai_simulated_api.metrics import simulator_metrics
 from aoai_simulated_api.models import RequestContext
 
 
-@dataclass
-class SimulatorMetrics:
-    histogram_latency_base: metrics.Histogram
-    histogram_latency_full: metrics.Histogram
-    histogram_tokens_used: metrics.Histogram
-    histogram_tokens_requested: metrics.Histogram
-
-
-def _get_simulator_metrics() -> SimulatorMetrics:
-    meter = metrics.get_meter(__name__)
-    return SimulatorMetrics(
-        histogram_latency_base=meter.create_histogram(
-            name="aoai-simulator.latency.base",
-            description="Latency of handling the request (before adding simulated latency)",
-            unit="seconds",
-        ),
-        histogram_latency_full=meter.create_histogram(
-            name="aoai-simulator.latency.full",
-            description="Full latency of handling the request (including simulated latency)",
-            unit="seconds",
-        ),
-        histogram_tokens_used=meter.create_histogram(
-            name="aoai-simulator.tokens.used",
-            description="Number of tokens used per request",
-            unit="tokens",
-        ),
-        histogram_tokens_requested=meter.create_histogram(
-            name="aoai-simulator.tokens.requested",
-            description="Number of tokens across all requests (success or not)",
-            unit="tokens",
-        ),
-    )
-
-
-simulator_metrics = _get_simulator_metrics()
 
 
 class LatencyGenerator:
@@ -93,8 +57,6 @@ async def apply_latency(self):
         extra_latency_s = target_duration_s - base_duration_s
 
         if extra_latency_s and extra_latency_s > 0:
-            current_span = trace.get_current_span()
-            current_span.set_attribute("simulator.added_latency", extra_latency_s)
             await asyncio.sleep(extra_latency_s)
 
         full_end_time = time.perf_counter()
diff --git a/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py b/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
index 7b06f3d..be61744 100644
--- a/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
+++ b/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
@@ -10,6 +10,7 @@
 
 from aoai_simulated_api import constants
+from aoai_simulated_api.metrics import simulator_metrics
 from aoai_simulated_api.models import Config, RequestContext
 
 logger = logging.getLogger(__name__)
@@ -93,6 +94,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
                     + f"Please retry after {retry_after} seconds.",
                 }
             }
+            simulator_metrics.histogram_rate_limit.record(
+                1,
+                attributes={
+                    "deployment": deployment_name,
+                    "reason": "requests_per_10s",
+                },
+            )
             return Response(
                 status_code=429,
                 content=json.dumps(content),
@@ -109,6 +117,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
                     + f"Please retry after {retry_after} seconds.",
                 }
             }
+            simulator_metrics.histogram_rate_limit.record(
+                1,
+                attributes={
+                    "deployment": deployment_name,
+                    "reason": "tokens_per_10s",
+                },
+            )
             return Response(
                 status_code=429,
                 content=json.dumps(content),
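The limiter hunks above only show where the new metric is recorded; the limiting logic itself sits outside this diff. Purely as an illustration of the kind of fixed-window accounting the load test exercises (an assumed sketch, not the simulator's actual implementation in limiters.py):

```python
# Illustrative sketch only -- NOT the simulator's real limiter.
# Shows the fixed-window accounting implied by "TPM limit -> tokens per 10s window".
import time


class FixedWindowTokenLimiter:
    def __init__(self, tokens_per_minute: int, window_seconds: int = 10):
        self.window_seconds = window_seconds
        self.tokens_per_window = tokens_per_minute * window_seconds / 60
        self._window_start = time.monotonic()
        self._used = 0.0

    def try_consume(self, tokens: int) -> bool:
        now = time.monotonic()
        if now - self._window_start >= self.window_seconds:
            # start a new 10s window
            self._window_start = now
            self._used = 0.0
        if self._used + tokens > self.tokens_per_window:
            return False  # caller would respond with 429 + Retry-After
        self._used += tokens
        return True


limiter = FixedWindowTokenLimiter(tokens_per_minute=100_000)
print(limiter.tokens_per_window)  # ~16666.7 tokens per 10s window
print(limiter.try_consume(200))   # True until the window's budget is exhausted
```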
diff --git a/src/aoai-simulated-api/src/aoai_simulated_api/metrics.py b/src/aoai-simulated-api/src/aoai_simulated_api/metrics.py
new file mode 100644
index 0000000..1c3254f
--- /dev/null
+++ b/src/aoai-simulated-api/src/aoai_simulated_api/metrics.py
@@ -0,0 +1,50 @@
+from dataclasses import dataclass
+from opentelemetry import metrics
+
+
+@dataclass
+class SimulatorMetrics:
+    histogram_latency_base: metrics.Histogram
+    histogram_latency_full: metrics.Histogram
+    histogram_tokens_used: metrics.Histogram
+    histogram_tokens_requested: metrics.Histogram
+    histogram_rate_limit: metrics.Histogram
+
+
+def _get_simulator_metrics() -> SimulatorMetrics:
+    meter = metrics.get_meter(__name__)
+    return SimulatorMetrics(
+        # dimensions: deployment, status_code
+        histogram_latency_base=meter.create_histogram(
+            name="aoai-simulator.latency.base",
+            description="Latency of handling the request (before adding simulated latency)",
+            unit="seconds",
+        ),
+        # dimensions: deployment, status_code
+        histogram_latency_full=meter.create_histogram(
+            name="aoai-simulator.latency.full",
+            description="Full latency of handling the request (including simulated latency)",
+            unit="seconds",
+        ),
+        # dimensions: deployment, token_type
+        histogram_tokens_used=meter.create_histogram(
+            name="aoai-simulator.tokens.used",
+            description="Number of tokens used per request",
+            unit="tokens",
+        ),
+        # dimensions: deployment, token_type
+        histogram_tokens_requested=meter.create_histogram(
+            name="aoai-simulator.tokens.requested",
+            description="Number of tokens across all requests (success or not)",
+            unit="tokens",
+        ),
+        # dimensions: deployment, reason
+        histogram_rate_limit=meter.create_histogram(
+            name="aoai-simulator.limits",
+            description="Number of requests that were rate-limited",
+            unit="requests",
+        ),
+    )
+
+
+simulator_metrics = _get_simulator_metrics()
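For a quick local look at the new `aoai-simulator.limits` histogram and its `deployment`/`reason` dimensions before they reach Log Analytics, a throwaway sketch (not part of this change; assumes the `opentelemetry-sdk` package is installed and re-declares the instrument rather than importing the simulator module):

```python
# Throwaway sketch: record one rate-limit event and dump the resulting data point.
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import InMemoryMetricReader

reader = InMemoryMetricReader()
metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))

meter = metrics.get_meter("rate-limit-sketch")
histogram_rate_limit = meter.create_histogram(
    name="aoai-simulator.limits",
    description="Number of requests that were rate-limited",
    unit="requests",
)
histogram_rate_limit.record(
    1, attributes={"deployment": "gpt-35-turbo-100k-token", "reason": "tokens_per_10s"}
)

for resource_metrics in reader.get_metrics_data().resource_metrics:
    for scope_metrics in resource_metrics.scope_metrics:
        for metric in scope_metrics.metrics:
            for point in metric.data.data_points:
                print(metric.name, dict(point.attributes), point.count)
```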
diff --git a/src/loadtest/post_steps_limits_tokens.py b/src/loadtest/post_steps_limits_tokens.py
new file mode 100644
index 0000000..5adfbb1
--- /dev/null
+++ b/src/loadtest/post_steps_limits_tokens.py
@@ -0,0 +1,217 @@
+from datetime import UTC, datetime, timedelta
+import os
+import logging
+
+import asciichartpy as asciichart
+from azure.identity import DefaultAzureCredential
+
+from common.config import (
+    tenant_id,
+    subscription_id,
+    resource_group_name,
+    log_analytics_workspace_id,
+    log_analytics_workspace_name,
+)
+
+from common.log_analytics import QueryProcessor, Table
+
+logging.basicConfig(level=logging.INFO)
+logging.getLogger("azure").setLevel(logging.WARNING)
+
+
+start_time_string = os.getenv("TEST_START_TIME")
+stop_time_string = os.getenv("TEST_STOP_TIME")
+
+test_start_time = datetime.strptime(start_time_string, "%Y-%m-%dT%H:%M:%SZ")
+test_stop_time = datetime.strptime(stop_time_string, "%Y-%m-%dT%H:%M:%SZ")
+
+print(f"test_start_time : {test_start_time}")
+print(f"test_end_time   : {test_stop_time}")
+
+
+metric_check_time = test_stop_time - timedelta(seconds=40)  # detecting the end of the test can take 30s, add 10s buffer
+
+query_processor = QueryProcessor(
+    workspace_id=log_analytics_workspace_id,
+    token_credential=DefaultAzureCredential(),
+    tenant_id=tenant_id,
+    subscription_id=subscription_id,
+    resource_group_name=resource_group_name,
+    workspace_name=log_analytics_workspace_name,
+)
+
+print(f"metric_check_time: {metric_check_time}")
+check_results_query = f"""
+AppMetrics
+| where TimeGenerated >= datetime({metric_check_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "locust.request_latency"
+| count
+"""
+query_processor.wait_for_non_zero_count(check_results_query)
+
+timespan = (datetime.now(UTC) - timedelta(days=1), datetime.now(UTC))
+
+
+####################################################################
+# Ensure the base latency remains low with rate-limiting in place
+#
+
+
+def validate_request_latency(table: Table):
+    mean_latency = table.rows[0][0]
+    if mean_latency > 10:
+        return f"Mean latency is too high: {mean_latency}"
+    return None
+
+
+query_processor.add_query(
+    title="Base Latency",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.latency.base"
+| summarize Sum=sum(Sum), Count = sum(ItemCount), Max=max(Max)
+| project mean_latency_ms=1000*Sum/Count, max_latency_ms=1000*Max
+""".strip(),
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+    validation_func=validate_request_latency,
+)
+
+
+####################################################################
+# Ensure the rate-limiting allows the expected tokens per 10s period
+#
+
+
+def validate_mean_tokens_used_per_10s(table: Table):
+    # Check if the mean tokens used per 10s is within the expected range
+    # The deployment for the tests has a 100,000 Tokens Per Minute (TPM) limit
+    # That equates to ~16,667 tokens per 10s period
+    mean_tokens_per_10s = table.rows[0][0]
+    if mean_tokens_per_10s > 17000:
+        return f"Mean tokens per 10s is too high: {mean_tokens_per_10s}"
+    if mean_tokens_per_10s < 16000:
+        return f"Mean tokens per 10s is too low: {mean_tokens_per_10s}"
+    return None
+
+
+query_processor.add_query(
+    title="Mean Tokens Per 10s (successful requests)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.tokens.used"
+| extend deployment = tostring(Properties["deployment"])
+| summarize total_token_count = sum(Sum) by bin(TimeGenerated, 10s)
+| summarize avg_tokens_per_10s = avg(total_token_count)
+""".strip(),
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+    validation_func=validate_mean_tokens_used_per_10s,
+)
+
+
+####################################################################
+# Ensure that we _do_ get 429 responses as expected
+#
+
+
+def validate_429_count(table: Table):
+    # With the level of user load targeting the deployment, we expect a high number of 429 responses
+    number_of_429_responses = table.rows[0][0]
+    if number_of_429_responses < 100:
+        return f"The number of 429 responses is too low: {number_of_429_responses}"
+    return None
+
+
+query_processor.add_query(
+    title="Number of 429 responses (should be high)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.latency.base"
+| extend status_code = Properties["status_code"]
+| where status_code == 429
+| summarize ItemCount=sum(ItemCount)
+""".strip(),
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+    validation_func=validate_429_count,
+)
+
+
+####################################################################
+# Show the tokens per 10s over time
+#
+
+query_processor.add_query(
+    title="Tokens per 10s over time (Used - yellow, Requested - blue)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and (Name == "aoai-simulator.tokens.used" or Name == "aoai-simulator.tokens.requested")
+| extend deployment = tostring(Properties["deployment"])
+| summarize total_token_count = sum(Sum) by bin(TimeGenerated, 10s), Name
+| evaluate pivot(Name, sum(total_token_count))
+| render timechart
+""".strip(),
+    is_chart=True,
+    columns=["aoai-simulator.tokens.used", "aoai-simulator.tokens.requested"],
+    chart_config={
+        "height": 15,
+        "min": 0,
+        "colors": [
+            asciichart.yellow,
+            asciichart.blue,
+        ],
+    },
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+)
+
+
+####################################################################
+# Show the base latency over time
+#
+
+query_processor.add_query(
+    title="Latency (base) over time in ms (mean - yellow, max - blue)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.latency.base"
+| summarize Sum=sum(Sum), Count = sum(ItemCount), Max=max(Max) by bin(TimeGenerated, 10s)
+| project TimeGenerated, mean_latency_ms=1000*Sum/Count, max_latency_ms=1000*Max
+| render timechart
+""".strip(),
+    is_chart=True,
+    columns=["mean_latency_ms", "max_latency_ms"],
+    chart_config={
+        "height": 15,
+        "min": 0,
+        "colors": [
+            asciichart.yellow,
+            asciichart.blue,
+        ],
+    },
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+)
+
+
+query_errors = query_processor.run_queries()
+
+if query_errors:
+    exit(1)
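As a sanity check on the thresholds used in `validate_429_count` and `validate_mean_tokens_used_per_10s`, a rough back-of-envelope calculation using the values from the script and docs above (approximate: it ignores the Locust ramp-up at 2 users/s and assumes the token budget is fully consumed):

```python
# Back-of-envelope estimate of expected 429s (approximate; ignores the ~15s Locust ramp-up).
run_time_s = 180          # LOCUST_RUN_TIME=3m
users = 30                # LOCUST_USERS=30, i.e. roughly 30 requests per second
max_tokens = 200          # MAX_TOKENS per request
tpm_limit = 100_000       # deployment tokens-per-minute limit

attempted = users * run_time_s                       # ~5400 requests attempted
tokens_per_10s = tpm_limit / 6                       # ~16,667 tokens per 10s window
successful_per_10s = tokens_per_10s // max_tokens    # ~83 requests can succeed per window
successful = successful_per_10s * (run_time_s / 10)  # ~1500 successful requests
expected_429s = attempted - successful               # ~3900, comfortably above the 100 threshold
print(attempted, successful, expected_429s)
```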