Commit: Add token-based limiting load test

stuartleeks committed Jun 24, 2024
1 parent ad282b7 commit 384a317

Showing 6 changed files with 340 additions and 39 deletions.
11 changes: 11 additions & 0 deletions docs/developing.md
@@ -65,3 +65,14 @@ This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 10 for each request.
By keeping the `max_tokens` value low, we should trigger the request-based rate limiting rather than the token-based limiting.


#### Load test: Token limiting (no added latency, 100,000 tokens per minute)

To run this test, run `./scripts/run-load-test-limits-tokens.sh`.

The simulator endpoint used in this test is configured for 100,000 tokens per minute.
This equates to ~16,667 tokens per 10-second window.

This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 200 for each request.
By keeping the `max_tokens` value high, we should trigger the token-based rate limiting rather than the request-based limiting.

> NOTE: Every 1000 tokens per minute allows 6 requests per minute. Provided the `max_tokens` value used is greater than 1000/6 ≈ 167 tokens per request, the rate-limiting should be triggered by tokens rather than requests.
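
As a quick sanity check of the arithmetic in the note above (a sketch for illustration only, not part of the committed docs; it assumes each request consumes roughly its `max_tokens` value):

```python
# Back-of-the-envelope check that max_tokens=200 exhausts the token budget
# before the request budget (assumes ~max_tokens tokens consumed per request).
tokens_per_minute = 100_000
requests_per_minute = 6 * tokens_per_minute // 1000   # 6 RPM per 1000 TPM -> 600 RPM

tokens_per_window = tokens_per_minute / 6              # ~16,667 tokens per 10s window
requests_per_window = requests_per_minute / 6          # 100 requests per 10s window

max_tokens = 200
break_even = tokens_per_minute / requests_per_minute   # 1000 / 6 ≈ 167 tokens per request

# Each request consumes ~1.2% of the token budget but only 1% of the request
# budget, so the token limit is exhausted first.
print(max_tokens / tokens_per_window)   # ≈ 0.012
print(1 / requests_per_window)          # = 0.01
print(max_tokens > break_even)          # True -> token-based limiting triggers first
```
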
46 changes: 46 additions & 0 deletions scripts/run-load-test-limits-tokens.sh
@@ -0,0 +1,46 @@
#!/bin/bash
set -e

#
# Runs a load test with no added latency against a deployment with a 100,000 TPM limit
# Used to validate the simulator's token-based rate limiting under load.
#
# The script runs a load test in Container Apps
# and then runs follow-up steps to validate the results.
#

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Use deployment with 100k TPM limit
deployment_name="gpt-35-turbo-100k-token"

# Set max_tokens high to trigger rate-limiting by tokens rather than by request count
# Since there are 6 RPM per 1000 TPM, a max_tokens greater than 1000/6 ≈ 167 will trigger token-based rate limiting
# Keeping the value relatively low makes the validation more granular
max_tokens=200
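
# ALLOW_429_RESPONSES is passed through to the load-test harness below; presumably
# this marks the expected 429 (rate-limited) responses as allowed rather than as failures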

result=$(\
LOCUST_USERS=30 \
LOCUST_RUN_TIME=3m \
LOCUST_SPAWN_RATE=2 \
TEST_FILE=./test_chat_completions_no_added_latency.py \
DEPLOYMENT_NAME=$deployment_name \
MAX_TOKENS=$max_tokens \
ALLOW_429_RESPONSES=true \
./scripts/_run-load-test-aca.sh)

echo -e "________________\n$result"


test_start_time=$(echo "$result" | jq -r '.start_time')
test_stop_time=$(echo "$result" | jq -r '.end_time')

echo "--test-start-time: '$test_start_time'"
echo "--test-stop-time: '$test_stop_time'"
echo ""
echo "Running post steps"

"$script_dir/_run-load-test-post-steps.sh" \
--test-start-time "$test_start_time" \
--test-stop-time "$test_stop_time" \
--filename ./src/loadtest/post_steps_limits_tokens.py
40 changes: 1 addition & 39 deletions src/aoai-simulated-api/src/aoai_simulated_api/latency.py
@@ -1,48 +1,12 @@
import asyncio
from dataclasses import dataclass
import time
from fastapi import Response
from opentelemetry import trace, metrics

from aoai_simulated_api import constants
from aoai_simulated_api.metrics import simulator_metrics
from aoai_simulated_api.models import RequestContext


@dataclass
class SimulatorMetrics:
    histogram_latency_base: metrics.Histogram
    histogram_latency_full: metrics.Histogram
    histogram_tokens_used: metrics.Histogram
    histogram_tokens_requested: metrics.Histogram


def _get_simulator_metrics() -> SimulatorMetrics:
    meter = metrics.get_meter(__name__)
    return SimulatorMetrics(
        histogram_latency_base=meter.create_histogram(
            name="aoai-simulator.latency.base",
            description="Latency of handling the request (before adding simulated latency)",
            unit="seconds",
        ),
        histogram_latency_full=meter.create_histogram(
            name="aoai-simulator.latency.full",
            description="Full latency of handling the request (including simulated latency)",
            unit="seconds",
        ),
        histogram_tokens_used=meter.create_histogram(
            name="aoai-simulator.tokens.used",
            description="Number of tokens used per request",
            unit="tokens",
        ),
        histogram_tokens_requested=meter.create_histogram(
            name="aoai-simulator.tokens.requested",
            description="Number of tokens across all requests (success or not)",
            unit="tokens",
        ),
    )


simulator_metrics = _get_simulator_metrics()


class LatencyGenerator:
@@ -93,8 +57,6 @@ async def apply_latency(self):
        extra_latency_s = target_duration_s - base_duration_s

        if extra_latency_s and extra_latency_s > 0:
            current_span = trace.get_current_span()
            current_span.set_attribute("simulator.added_latency", extra_latency_s)
            await asyncio.sleep(extra_latency_s)

        full_end_time = time.perf_counter()
15 changes: 15 additions & 0 deletions src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
@@ -10,6 +10,7 @@


from aoai_simulated_api import constants
from aoai_simulated_api.metrics import simulator_metrics
from aoai_simulated_api.models import Config, RequestContext

logger = logging.getLogger(__name__)
@@ -93,6 +94,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
+ f"Please retry after {retry_after} seconds.",
}
}
simulator_metrics.histogram_rate_limit.record(
    1,
    attributes={
        "deployment": deployment_name,
        "reason": "requests_per_10s",
    },
)
return Response(
    status_code=429,
    content=json.dumps(content),
@@ -109,6 +117,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
+ f"Please retry after {retry_after} seconds.",
}
}
simulator_metrics.histogram_rate_limit.record(
    1,
    attributes={
        "deployment": deployment_name,
        "reason": "tokens_per_10s",
    },
)
return Response(
    status_code=429,
    content=json.dumps(content),
50 changes: 50 additions & 0 deletions src/aoai-simulated-api/src/aoai_simulated_api/metrics.py
@@ -0,0 +1,50 @@
from dataclasses import dataclass
from opentelemetry import metrics


@dataclass
class SimulatorMetrics:
    histogram_latency_base: metrics.Histogram
    histogram_latency_full: metrics.Histogram
    histogram_tokens_used: metrics.Histogram
    histogram_tokens_requested: metrics.Histogram
    histogram_rate_limit: metrics.Histogram


def _get_simulator_metrics() -> SimulatorMetrics:
    meter = metrics.get_meter(__name__)
    return SimulatorMetrics(
        # dimensions: deployment, status_code
        histogram_latency_base=meter.create_histogram(
            name="aoai-simulator.latency.base",
            description="Latency of handling the request (before adding simulated latency)",
            unit="seconds",
        ),
        # dimensions: deployment, status_code
        histogram_latency_full=meter.create_histogram(
            name="aoai-simulator.latency.full",
            description="Full latency of handling the request (including simulated latency)",
            unit="seconds",
        ),
        # dimensions: deployment, token_type
        histogram_tokens_used=meter.create_histogram(
            name="aoai-simulator.tokens.used",
            description="Number of tokens used per request",
            unit="tokens",
        ),
        # dimensions: deployment, token_type
        histogram_tokens_requested=meter.create_histogram(
            name="aoai-simulator.tokens.requested",
            description="Number of tokens across all requests (success or not)",
            unit="tokens",
        ),
        # dimensions: deployment, reason
        histogram_rate_limit=meter.create_histogram(
            name="aoai-simulator.limits",
            description="Number of requests that were rate-limited",
            unit="requests",
        ),
    )


simulator_metrics = _get_simulator_metrics()
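
For local debugging, the new `aoai-simulator.limits` histogram can be exercised and inspected with the OpenTelemetry SDK's console exporter (requires the `opentelemetry-sdk` package). This is a minimal illustrative sketch; the exporter setup is an assumption for local use, not how the simulator configures its metrics pipeline:

```python
# Minimal sketch: record the rate-limit metric the way limiters.py now does,
# and print it via the OpenTelemetry console exporter.
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader

# Export accumulated metrics to stdout every second (local debugging only).
reader = PeriodicExportingMetricReader(ConsoleMetricExporter(), export_interval_millis=1000)
metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))

meter = metrics.get_meter(__name__)
histogram_rate_limit = meter.create_histogram(
    name="aoai-simulator.limits",
    description="Number of requests that were rate-limited",
    unit="requests",
)

# Mirrors the call added to limiters.py when a token-based 429 is returned.
histogram_rate_limit.record(
    1,
    attributes={"deployment": "gpt-35-turbo-100k-token", "reason": "tokens_per_10s"},
)
```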