forked from microsoft/aoai-api-simulator
Commit 384a317 (1 parent: ad282b7)

This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Showing 6 changed files with 340 additions and 39 deletions.
@@ -0,0 +1,46 @@
#!/bin/bash
set -e

#
# Runs a load test with no added latency against a deployment with a token limit.
# Used to validate token-based rate-limiting in the simulator under load.
#
# The script runs a load test in Container Apps
# and then runs follow-up steps to validate the results.
#

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Use deployment with 100k TPM limit
deployment_name="gpt-35-turbo-100k-token"

# Set max_tokens high enough to trigger rate-limiting by tokens rather than by request count.
# Since there are 6 RPM per 1000 TPM, a max_tokens above 1000/6 (approx. 167) ensures the token limit is hit first.
# Keeping the value relatively low makes the validation more granular.
max_tokens=200

result=$(\
  LOCUST_USERS=30 \
  LOCUST_RUN_TIME=3m \
  LOCUST_SPAWN_RATE=2 \
  TEST_FILE=./test_chat_completions_no_added_latency.py \
  DEPLOYMENT_NAME=$deployment_name \
  MAX_TOKENS=$max_tokens \
  ALLOW_429_RESPONSES=true \
  ./scripts/_run-load-test-aca.sh)

echo -e "________________\n$result"


test_start_time=$(echo "$result" | jq -r '.start_time')
test_stop_time=$(echo "$result" | jq -r '.end_time')

echo "--test-start-time: '$test_start_time'"
echo "--test-stop-time: '$test_stop_time'"
echo ""
echo "Running post steps"

"$script_dir/_run-load-test-post-steps.sh" \
  --test-start-time "$test_start_time" \
  --test-stop-time "$test_stop_time" \
  --filename ./src/loadtest/post_steps_limits_tokens.py
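Aside: the threshold arithmetic in the script's comments can be sanity-checked with a short Python sketch. This is illustrative only: the 100k TPM figure is inferred from the deployment name, and the 6 RPM per 1000 TPM ratio is taken from the comment in the script.

# Sanity check (illustrative): the token limit should bind before the request limit.
tpm_limit = 100_000                  # tokens-per-minute limit (inferred from the deployment name)
rpm_limit = tpm_limit // 1000 * 6    # 6 RPM per 1000 TPM -> 600 requests per minute
threshold = tpm_limit / rpm_limit    # tokens per request at which both limits bind equally (approx. 166.7)

max_tokens = 200
assert max_tokens > threshold, "requests would hit the RPM limit before the token limit"
print(f"threshold = {threshold:.0f} tokens/request; max_tokens = {max_tokens}")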
@@ -0,0 +1,50 @@
from dataclasses import dataclass
from opentelemetry import metrics


@dataclass
class SimulatorMetrics:
    histogram_latency_base: metrics.Histogram
    histogram_latency_full: metrics.Histogram
    histogram_tokens_used: metrics.Histogram
    histogram_tokens_requested: metrics.Histogram
    histogram_rate_limit: metrics.Histogram


def _get_simulator_metrics() -> SimulatorMetrics:
    meter = metrics.get_meter(__name__)
    return SimulatorMetrics(
        # dimensions: deployment, status_code
        histogram_latency_base=meter.create_histogram(
            name="aoai-simulator.latency.base",
            description="Latency of handling the request (before adding simulated latency)",
            unit="seconds",
        ),
        # dimensions: deployment, status_code
        histogram_latency_full=meter.create_histogram(
            name="aoai-simulator.latency.full",
            description="Full latency of handling the request (including simulated latency)",
            unit="seconds",
        ),
        # dimensions: deployment, token_type
        histogram_tokens_used=meter.create_histogram(
            name="aoai-simulator.tokens.used",
            description="Number of tokens used per request",
            unit="tokens",
        ),
        # dimensions: deployment, token_type
        histogram_tokens_requested=meter.create_histogram(
            name="aoai-simulator.tokens.requested",
            description="Number of tokens requested across all requests (successful or not)",
            unit="tokens",
        ),
        # dimensions: deployment, reason
        histogram_rate_limit=meter.create_histogram(
            name="aoai-simulator.limits",
            description="Number of requests that were rate-limited",
            unit="requests",
        ),
    )


simulator_metrics = _get_simulator_metrics()
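For reference, a hypothetical call site could record into these histograms as sketched below. The import path, attribute values, and token count are assumptions for illustration; Histogram.record with an attributes dict is the standard OpenTelemetry Python API.

import time

# Hypothetical import path; adjust to wherever this module lives in the repo.
from metrics_module import simulator_metrics

start = time.perf_counter()
# ... handle the request here ...
base_latency = time.perf_counter() - start

# dimensions: deployment, status_code (per the comments above)
simulator_metrics.histogram_latency_base.record(
    base_latency,
    attributes={"deployment": "gpt-35-turbo-100k-token", "status_code": 200},
)

# dimensions: deployment, token_type
simulator_metrics.histogram_tokens_used.record(
    42,  # illustrative token count
    attributes={"deployment": "gpt-35-turbo-100k-token", "token_type": "completion"},
)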