Commit: Add token-based limiting load test

stuartleeks committed Jun 24, 2024
1 parent ad282b7 commit 384a317

Showing 6 changed files with 340 additions and 39 deletions.
11 changes: 11 additions & 0 deletions docs/developing.md
@@ -65,3 +65,14 @@ This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 10 for each request.
By keeping the `max_tokens` value low, we should trigger the request-based rate limiting rather than the token-based limiting.


#### Load test: Token limiting (no added latency, 100,000 tokens per minute)

To run this test, run `./scripts/run-load-test-limits-tokens.sh`.

The simulator endpoint used in this test is configured for 100,000 tokens per minute.
This equates to ~16,667 tokens per 10-second window.

This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 200 for each request.
By keeping the `max_tokens` value high, we should trigger the token-based rate limiting rather than the request-based limiting.

> NOTE: Every 1000 tokens per minute allows 6 requests per minute. Provided the `max_tokens` value used is greater than 1000/6 ≈ 167 tokens per request, the rate-limiting should be triggered by tokens rather than requests.
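
As a quick sanity check of the arithmetic in the note above (a sketch for illustration only, not part of the committed docs; it assumes each request consumes roughly its `max_tokens` value):

```python
# Back-of-the-envelope check that max_tokens=200 exhausts the token budget
# before the request budget (assumes ~max_tokens tokens consumed per request).
tokens_per_minute = 100_000
requests_per_minute = 6 * tokens_per_minute // 1000   # 6 RPM per 1000 TPM -> 600 RPM

tokens_per_window = tokens_per_minute / 6              # ~16,667 tokens per 10s window
requests_per_window = requests_per_minute / 6          # 100 requests per 10s window

max_tokens = 200
break_even = tokens_per_minute / requests_per_minute   # 1000 / 6 ≈ 167 tokens per request

# Each request consumes ~1.2% of the token budget but only 1% of the request
# budget, so the token limit is exhausted first.
print(max_tokens / tokens_per_window)   # ≈ 0.012
print(1 / requests_per_window)          # = 0.01
print(max_tokens > break_even)          # True -> token-based limiting triggers first
```
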
46 changes: 46 additions & 0 deletions scripts/run-load-test-limits-tokens.sh
@@ -0,0 +1,46 @@
#!/bin/bash
set -e

#
# Runs a load test with no added latency against a deployment with a 100,000 TPM limit
# Used to validate the simulator's token-based rate limiting under load.
#
# The script runs a load test in Container Apps
# and then runs follow-up steps to validate the results.
#

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Use deployment with 100k TPM limit
deployment_name="gpt-35-turbo-100k-token"

# Set max_tokens high to trigger rate-limiting by tokens rather than by request count
# Since there are 6 RPM per 1000 TPM, a max_tokens greater than 1000/6 ≈ 167 will trigger token-based rate limiting
# Keeping the value relatively low makes the validation more granular
max_tokens=200
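
# ALLOW_429_RESPONSES is passed through to the load-test harness below; presumably
# this marks the expected 429 (rate-limited) responses as allowed rather than as failures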

result=$(\
LOCUST_USERS=30 \
LOCUST_RUN_TIME=3m \
LOCUST_SPAWN_RATE=2 \
TEST_FILE=./test_chat_completions_no_added_latency.py \
DEPLOYMENT_NAME=$deployment_name \
MAX_TOKENS=$max_tokens \
ALLOW_429_RESPONSES=true \
./scripts/_run-load-test-aca.sh)

echo -e "________________\n$result"


test_start_time=$(echo "$result" | jq -r '.start_time')
test_stop_time=$(echo "$result" | jq -r '.end_time')

echo "--test-start-time: '$test_start_time'"
echo "--test-stop-time: '$test_stop_time'"
echo ""
echo "Running post steps"

"$script_dir/_run-load-test-post-steps.sh" \
--test-start-time "$test_start_time" \
--test-stop-time "$test_stop_time" \
--filename ./src/loadtest/post_steps_limits_tokens.py
40 changes: 1 addition & 39 deletions src/aoai-simulated-api/src/aoai_simulated_api/latency.py
@@ -1,48 +1,12 @@
import asyncio
from dataclasses import dataclass
import time
from fastapi import Response
from opentelemetry import trace, metrics

from aoai_simulated_api import constants
from aoai_simulated_api.metrics import simulator_metrics
from aoai_simulated_api.models import RequestContext


@dataclass
class SimulatorMetrics:
    histogram_latency_base: metrics.Histogram
    histogram_latency_full: metrics.Histogram
    histogram_tokens_used: metrics.Histogram
    histogram_tokens_requested: metrics.Histogram


def _get_simulator_metrics() -> SimulatorMetrics:
    meter = metrics.get_meter(__name__)
    return SimulatorMetrics(
        histogram_latency_base=meter.create_histogram(
            name="aoai-simulator.latency.base",
            description="Latency of handling the request (before adding simulated latency)",
            unit="seconds",
        ),
        histogram_latency_full=meter.create_histogram(
            name="aoai-simulator.latency.full",
            description="Full latency of handling the request (including simulated latency)",
            unit="seconds",
        ),
        histogram_tokens_used=meter.create_histogram(
            name="aoai-simulator.tokens.used",
            description="Number of tokens used per request",
            unit="tokens",
        ),
        histogram_tokens_requested=meter.create_histogram(
            name="aoai-simulator.tokens.requested",
            description="Number of tokens across all requests (success or not)",
            unit="tokens",
        ),
    )


simulator_metrics = _get_simulator_metrics()


class LatencyGenerator:
@@ -93,8 +57,6 @@ async def apply_latency(self):
        extra_latency_s = target_duration_s - base_duration_s

        if extra_latency_s and extra_latency_s > 0:
            current_span = trace.get_current_span()
            current_span.set_attribute("simulator.added_latency", extra_latency_s)
            await asyncio.sleep(extra_latency_s)

        full_end_time = time.perf_counter()
15 changes: 15 additions & 0 deletions src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
@@ -10,6 +10,7 @@


from aoai_simulated_api import constants
from aoai_simulated_api.metrics import simulator_metrics
from aoai_simulated_api.models import Config, RequestContext

logger = logging.getLogger(__name__)
@@ -93,6 +94,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
+ f"Please retry after {retry_after} seconds.",
}
}
simulator_metrics.histogram_rate_limit.record(
    1,
    attributes={
        "deployment": deployment_name,
        "reason": "requests_per_10s",
    },
)
return Response(
    status_code=429,
    content=json.dumps(content),
@@ -109,6 +117,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
+ f"Please retry after {retry_after} seconds.",
}
}
simulator_metrics.histogram_rate_limit.record(
    1,
    attributes={
        "deployment": deployment_name,
        "reason": "tokens_per_10s",
    },
)
return Response(
    status_code=429,
    content=json.dumps(content),
50 changes: 50 additions & 0 deletions src/aoai-simulated-api/src/aoai_simulated_api/metrics.py
@@ -0,0 +1,50 @@
from dataclasses import dataclass
from opentelemetry import metrics


@dataclass
class SimulatorMetrics:
    histogram_latency_base: metrics.Histogram
    histogram_latency_full: metrics.Histogram
    histogram_tokens_used: metrics.Histogram
    histogram_tokens_requested: metrics.Histogram
    histogram_rate_limit: metrics.Histogram


def _get_simulator_metrics() -> SimulatorMetrics:
    meter = metrics.get_meter(__name__)
    return SimulatorMetrics(
        # dimensions: deployment, status_code
        histogram_latency_base=meter.create_histogram(
            name="aoai-simulator.latency.base",
            description="Latency of handling the request (before adding simulated latency)",
            unit="seconds",
        ),
        # dimensions: deployment, status_code
        histogram_latency_full=meter.create_histogram(
            name="aoai-simulator.latency.full",
            description="Full latency of handling the request (including simulated latency)",
            unit="seconds",
        ),
        # dimensions: deployment, token_type
        histogram_tokens_used=meter.create_histogram(
            name="aoai-simulator.tokens.used",
            description="Number of tokens used per request",
            unit="tokens",
        ),
        # dimensions: deployment, token_type
        histogram_tokens_requested=meter.create_histogram(
            name="aoai-simulator.tokens.requested",
            description="Number of tokens across all requests (success or not)",
            unit="tokens",
        ),
        # dimensions: deployment, reason
        histogram_rate_limit=meter.create_histogram(
            name="aoai-simulator.limits",
            description="Number of requests that were rate-limited",
            unit="requests",
        ),
    )


simulator_metrics = _get_simulator_metrics()
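
For local debugging, the new `aoai-simulator.limits` histogram can be exercised and inspected with the OpenTelemetry SDK's console exporter (requires the `opentelemetry-sdk` package). This is a minimal illustrative sketch; the exporter setup is an assumption for local use, not how the simulator configures its metrics pipeline:

```python
# Minimal sketch: record the rate-limit metric the way limiters.py now does,
# and print it via the OpenTelemetry console exporter.
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader

# Export accumulated metrics to stdout every second (local debugging only).
reader = PeriodicExportingMetricReader(ConsoleMetricExporter(), export_interval_millis=1000)
metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))

meter = metrics.get_meter(__name__)
histogram_rate_limit = meter.create_histogram(
    name="aoai-simulator.limits",
    description="Number of requests that were rate-limited",
    unit="requests",
)

# Mirrors the call added to limiters.py when a token-based 429 is returned.
histogram_rate_limit.record(
    1,
    attributes={"deployment": "gpt-35-turbo-100k-token", "reason": "tokens_per_10s"},
)
```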