diff --git a/docs/developing.md b/docs/developing.md
index 4535ab7..23571c2 100644
--- a/docs/developing.md
+++ b/docs/developing.md
@@ -65,3 +65,14 @@ This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 10 for e
 By keeping the `max_tokens` value low, we should trigger the request-based rate limiting rather than the token-based limiting.
 
+#### Load test: Token limiting (no added latency, 100,000 tokens per minute)
+
+To run this test, run `./scripts/run-load-test-limits-tokens.sh`.
+
+The simulator endpoint used in this test is configured for 100,000 tokens per minute.
+This equates to ~16,667 tokens per 10-second window.
+
+This test uses 30 test users (i.e. ~30RPS) with a `max_tokens` value of 200 for each request.
+By keeping the `max_tokens` value high, we should trigger the token-based rate limiting rather than the request-based limiting.
+
+> NOTE: Every 1000 tokens per minute allows 6 requests per minute. Provided the `max_tokens` value used is greater than 1000/6 ≈ 167, the rate-limiting should be triggered by tokens rather than requests.
diff --git a/scripts/run-load-test-limits-tokens.sh b/scripts/run-load-test-limits-tokens.sh
new file mode 100755
index 0000000..f86de31
--- /dev/null
+++ b/scripts/run-load-test-limits-tokens.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -e
+
+#
+# Runs a load test with no added latency against a deployment with a token limit.
+# Used to validate the token-based rate-limiting of the simulator under load.
+#
+# The script runs a load test in Container Apps
+# and then runs follow-up steps to validate the results.
+#
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Use deployment with 100k TPM limit
+deployment_name="gpt-35-turbo-100k-token"
+
+# Set max_tokens high to trigger rate-limiting by tokens not request count.
+# Since there are 6 RPM per 1000 TPM, a max_tokens value above 1000/6 (~167) hits the token limit first.
+# Keeping the value relatively low makes the validation more granular.
+max_tokens=200
+
+result=$(\
+  LOCUST_USERS=30 \
+  LOCUST_RUN_TIME=3m \
+  LOCUST_SPAWN_RATE=2 \
+  TEST_FILE=./test_chat_completions_no_added_latency.py \
+  DEPLOYMENT_NAME=$deployment_name \
+  MAX_TOKENS=$max_tokens \
+  ALLOW_429_RESPONSES=true \
+  ./scripts/_run-load-test-aca.sh)
+
+echo -e "________________\n$result"
+
+
+test_start_time=$(echo "$result" | jq -r '.start_time')
+test_stop_time=$(echo "$result" | jq -r '.end_time')
+
+echo "--test-start-time: '$test_start_time'"
+echo "--test-stop-time: '$test_stop_time'"
+echo ""
+echo "Running post steps"
+
+"$script_dir/_run-load-test-post-steps.sh" \
+  --test-start-time "$test_start_time" \
+  --test-stop-time "$test_stop_time" \
+  --filename ./src/loadtest/post_steps_limits_tokens.py
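A small worked example of the arithmetic behind the NOTE above and the `max_tokens` comment in the script (values taken from the docs and script; illustrative only, not part of the change):

```python
# Worked example of the token-vs-request limiting arithmetic (illustrative only).
TPM = 100_000                  # tokens-per-minute limit on the test deployment
RPM = 6 * (TPM // 1000)        # 6 requests per minute are allowed per 1000 TPM -> 600 RPM

tokens_per_10s = TPM / 6       # ~16,667 tokens per 10-second window
requests_per_10s = RPM / 6     # ~100 requests per 10-second window
token_threshold = TPM / RPM    # 1000/6 ~= 167 tokens per request before the token limit dominates

max_tokens = 200               # value used by this load test
limiting_factor = "tokens" if max_tokens > token_threshold else "requests"
print(round(tokens_per_10s), round(token_threshold), limiting_factor)  # 16667 167 tokens
```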
diff --git a/src/aoai-simulated-api/src/aoai_simulated_api/latency.py b/src/aoai-simulated-api/src/aoai_simulated_api/latency.py
index 26e3d1a..0034529 100644
--- a/src/aoai-simulated-api/src/aoai_simulated_api/latency.py
+++ b/src/aoai-simulated-api/src/aoai_simulated_api/latency.py
@@ -1,48 +1,12 @@
 import asyncio
-from dataclasses import dataclass
 import time
 
 from fastapi import Response
-from opentelemetry import trace, metrics
 
 from aoai_simulated_api import constants
+from aoai_simulated_api.metrics import simulator_metrics
 from aoai_simulated_api.models import RequestContext
 
 
-@dataclass
-class SimulatorMetrics:
-    histogram_latency_base: metrics.Histogram
-    histogram_latency_full: metrics.Histogram
-    histogram_tokens_used: metrics.Histogram
-    histogram_tokens_requested: metrics.Histogram
-
-
-def _get_simulator_metrics() -> SimulatorMetrics:
-    meter = metrics.get_meter(__name__)
-    return SimulatorMetrics(
-        histogram_latency_base=meter.create_histogram(
-            name="aoai-simulator.latency.base",
-            description="Latency of handling the request (before adding simulated latency)",
-            unit="seconds",
-        ),
-        histogram_latency_full=meter.create_histogram(
-            name="aoai-simulator.latency.full",
-            description="Full latency of handling the request (including simulated latency)",
-            unit="seconds",
-        ),
-        histogram_tokens_used=meter.create_histogram(
-            name="aoai-simulator.tokens.used",
-            description="Number of tokens used per request",
-            unit="tokens",
-        ),
-        histogram_tokens_requested=meter.create_histogram(
-            name="aoai-simulator.tokens.requested",
-            description="Number of tokens across all requests (success or not)",
-            unit="tokens",
-        ),
-    )
-
-
-simulator_metrics = _get_simulator_metrics()
 
 
 class LatencyGenerator:
@@ -93,8 +57,6 @@ async def apply_latency(self):
         extra_latency_s = target_duration_s - base_duration_s
 
         if extra_latency_s and extra_latency_s > 0:
-            current_span = trace.get_current_span()
-            current_span.set_attribute("simulator.added_latency", extra_latency_s)
             await asyncio.sleep(extra_latency_s)
 
         full_end_time = time.perf_counter()
diff --git a/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py b/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
index 7b06f3d..be61744 100644
--- a/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
+++ b/src/aoai-simulated-api/src/aoai_simulated_api/limiters.py
@@ -10,6 +10,7 @@
 
 from aoai_simulated_api import constants
+from aoai_simulated_api.metrics import simulator_metrics
 from aoai_simulated_api.models import Config, RequestContext
 
 logger = logging.getLogger(__name__)
@@ -93,6 +94,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
                     + f"Please retry after {retry_after} seconds.",
                 }
             }
+            simulator_metrics.histogram_rate_limit.record(
+                1,
+                attributes={
+                    "deployment": deployment_name,
+                    "reason": "requests_per_10s",
+                },
+            )
             return Response(
                 status_code=429,
                 content=json.dumps(content),
@@ -109,6 +117,13 @@ def limiter(context: RequestContext, response: Response) -> Response:
                     + f"Please retry after {retry_after} seconds.",
                 }
             }
+            simulator_metrics.histogram_rate_limit.record(
+                1,
+                attributes={
+                    "deployment": deployment_name,
+                    "reason": "tokens_per_10s",
+                },
+            )
             return Response(
                 status_code=429,
                 content=json.dumps(content),
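The limiter hunks above only show where the new metric is recorded; the limiting logic itself sits outside this diff. Purely as an illustration of the kind of fixed-window accounting the load test exercises (an assumed sketch, not the simulator's actual implementation in limiters.py):

```python
# Illustrative sketch only -- NOT the simulator's real limiter.
# Shows the fixed-window accounting implied by "TPM limit -> tokens per 10s window".
import time


class FixedWindowTokenLimiter:
    def __init__(self, tokens_per_minute: int, window_seconds: int = 10):
        self.window_seconds = window_seconds
        self.tokens_per_window = tokens_per_minute * window_seconds / 60
        self._window_start = time.monotonic()
        self._used = 0.0

    def try_consume(self, tokens: int) -> bool:
        now = time.monotonic()
        if now - self._window_start >= self.window_seconds:
            # start a new 10s window
            self._window_start = now
            self._used = 0.0
        if self._used + tokens > self.tokens_per_window:
            return False  # caller would respond with 429 + Retry-After
        self._used += tokens
        return True


limiter = FixedWindowTokenLimiter(tokens_per_minute=100_000)
print(limiter.tokens_per_window)  # ~16666.7 tokens per 10s window
print(limiter.try_consume(200))   # True until the window's budget is exhausted
```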
diff --git a/src/aoai-simulated-api/src/aoai_simulated_api/metrics.py b/src/aoai-simulated-api/src/aoai_simulated_api/metrics.py
new file mode 100644
index 0000000..1c3254f
--- /dev/null
+++ b/src/aoai-simulated-api/src/aoai_simulated_api/metrics.py
@@ -0,0 +1,50 @@
+from dataclasses import dataclass
+from opentelemetry import metrics
+
+
+@dataclass
+class SimulatorMetrics:
+    histogram_latency_base: metrics.Histogram
+    histogram_latency_full: metrics.Histogram
+    histogram_tokens_used: metrics.Histogram
+    histogram_tokens_requested: metrics.Histogram
+    histogram_rate_limit: metrics.Histogram
+
+
+def _get_simulator_metrics() -> SimulatorMetrics:
+    meter = metrics.get_meter(__name__)
+    return SimulatorMetrics(
+        # dimensions: deployment, status_code
+        histogram_latency_base=meter.create_histogram(
+            name="aoai-simulator.latency.base",
+            description="Latency of handling the request (before adding simulated latency)",
+            unit="seconds",
+        ),
+        # dimensions: deployment, status_code
+        histogram_latency_full=meter.create_histogram(
+            name="aoai-simulator.latency.full",
+            description="Full latency of handling the request (including simulated latency)",
+            unit="seconds",
+        ),
+        # dimensions: deployment, token_type
+        histogram_tokens_used=meter.create_histogram(
+            name="aoai-simulator.tokens.used",
+            description="Number of tokens used per request",
+            unit="tokens",
+        ),
+        # dimensions: deployment, token_type
+        histogram_tokens_requested=meter.create_histogram(
+            name="aoai-simulator.tokens.requested",
+            description="Number of tokens across all requests (success or not)",
+            unit="tokens",
+        ),
+        # dimensions: deployment, reason
+        histogram_rate_limit=meter.create_histogram(
+            name="aoai-simulator.limits",
+            description="Number of requests that were rate-limited",
+            unit="requests",
+        ),
+    )
+
+
+simulator_metrics = _get_simulator_metrics()
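For a quick local look at the new `aoai-simulator.limits` histogram and its `deployment`/`reason` dimensions before they reach Log Analytics, a throwaway sketch (not part of this change; assumes the `opentelemetry-sdk` package is installed and re-declares the instrument rather than importing the simulator module):

```python
# Throwaway sketch: record one rate-limit event and dump the resulting data point.
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import InMemoryMetricReader

reader = InMemoryMetricReader()
metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))

meter = metrics.get_meter("rate-limit-sketch")
histogram_rate_limit = meter.create_histogram(
    name="aoai-simulator.limits",
    description="Number of requests that were rate-limited",
    unit="requests",
)
histogram_rate_limit.record(
    1, attributes={"deployment": "gpt-35-turbo-100k-token", "reason": "tokens_per_10s"}
)

for resource_metrics in reader.get_metrics_data().resource_metrics:
    for scope_metrics in resource_metrics.scope_metrics:
        for metric in scope_metrics.metrics:
            for point in metric.data.data_points:
                print(metric.name, dict(point.attributes), point.count)
```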
diff --git a/src/loadtest/post_steps_limits_tokens.py b/src/loadtest/post_steps_limits_tokens.py
new file mode 100644
index 0000000..5adfbb1
--- /dev/null
+++ b/src/loadtest/post_steps_limits_tokens.py
@@ -0,0 +1,217 @@
+from datetime import UTC, datetime, timedelta
+import os
+import logging
+
+import asciichartpy as asciichart
+from azure.identity import DefaultAzureCredential
+
+from common.config import (
+    tenant_id,
+    subscription_id,
+    resource_group_name,
+    log_analytics_workspace_id,
+    log_analytics_workspace_name,
+)
+
+from common.log_analytics import QueryProcessor, Table
+
+logging.basicConfig(level=logging.INFO)
+logging.getLogger("azure").setLevel(logging.WARNING)
+
+
+start_time_string = os.getenv("TEST_START_TIME")
+stop_time_string = os.getenv("TEST_STOP_TIME")
+
+test_start_time = datetime.strptime(start_time_string, "%Y-%m-%dT%H:%M:%SZ")
+test_stop_time = datetime.strptime(stop_time_string, "%Y-%m-%dT%H:%M:%SZ")
+
+print(f"test_start_time : {test_start_time}")
+print(f"test_end_time   : {test_stop_time}")
+
+
+metric_check_time = test_stop_time - timedelta(seconds=40)  # detecting the end of the test can take 30s, add 10s buffer
+
+query_processor = QueryProcessor(
+    workspace_id=log_analytics_workspace_id,
+    token_credential=DefaultAzureCredential(),
+    tenant_id=tenant_id,
+    subscription_id=subscription_id,
+    resource_group_name=resource_group_name,
+    workspace_name=log_analytics_workspace_name,
+)
+
+print(f"metric_check_time: {metric_check_time}")
+check_results_query = f"""
+AppMetrics
+| where TimeGenerated >= datetime({metric_check_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "locust.request_latency"
+| count
+"""
+query_processor.wait_for_non_zero_count(check_results_query)
+
+timespan = (datetime.now(UTC) - timedelta(days=1), datetime.now(UTC))
+
+
+####################################################################
+# Ensure the base latency remains low with rate-limiting in place
+#
+
+
+def validate_request_latency(table: Table):
+    mean_latency = table.rows[0][0]
+    if mean_latency > 10:
+        return f"Mean latency is too high: {mean_latency}"
+    return None
+
+
+query_processor.add_query(
+    title="Base Latency",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.latency.base"
+| summarize Sum=sum(Sum), Count = sum(ItemCount), Max=max(Max)
+| project mean_latency_ms=1000*Sum/Count, max_latency_ms=1000*Max
+""".strip(),
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+    validation_func=validate_request_latency,
+)
+
+
+####################################################################
+# Ensure the rate-limiting allows the expected tokens per 10s period
+#
+
+
+def validate_mean_tokens_used_per_10s(table: Table):
+    # Check if the mean tokens used per 10s is within the expected range
+    # The deployment for the tests has a 100,000 Tokens Per Minute (TPM) limit
+    # That equates to ~16,667 tokens per 10s period
+    mean_tokens_per_10s = table.rows[0][0]
+    if mean_tokens_per_10s > 17000:
+        return f"Mean tokens per 10s is too high: {mean_tokens_per_10s}"
+    if mean_tokens_per_10s < 16000:
+        return f"Mean tokens per 10s is too low: {mean_tokens_per_10s}"
+    return None
+
+
+query_processor.add_query(
+    title="Mean Tokens Per 10s (successful requests)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.tokens.used"
+| extend deployment = tostring(Properties["deployment"])
+| summarize total_token_count = sum(Sum) by bin(TimeGenerated, 10s)
+| summarize avg_tokens_per_10s = avg(total_token_count)
+""".strip(),
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+    validation_func=validate_mean_tokens_used_per_10s,
+)
+
+
+####################################################################
+# Ensure that we _do_ get 429 responses as expected
+#
+
+
+def validate_429_count(table: Table):
+    # With the level of user load targeting the deployment, we expect a high number of 429 responses
+    number_of_429_responses = table.rows[0][0]
+    if number_of_429_responses < 100:
+        return f"The number of 429 responses is too low: {number_of_429_responses}"
+    return None
+
+
+query_processor.add_query(
+    title="Number of 429 responses (should be high)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.latency.base"
+| extend status_code = Properties["status_code"]
+| where status_code == 429
+| summarize ItemCount=sum(ItemCount)
+""".strip(),
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+    validation_func=validate_429_count,
+)
+
+
+####################################################################
+# Show the tokens per 10s over time
+#
+
+query_processor.add_query(
+    title="Tokens per 10s over time (Used - yellow, Requested - blue)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and (Name == "aoai-simulator.tokens.used" or Name == "aoai-simulator.tokens.requested")
+| extend deployment = tostring(Properties["deployment"])
+| summarize total_token_count = sum(Sum) by bin(TimeGenerated, 10s), Name
+| evaluate pivot(Name, sum(total_token_count))
+| render timechart
+""".strip(),
+    is_chart=True,
+    columns=["aoai-simulator.tokens.used", "aoai-simulator.tokens.requested"],
+    chart_config={
+        "height": 15,
+        "min": 0,
+        "colors": [
+            asciichart.yellow,
+            asciichart.blue,
+        ],
+    },
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+)
+
+
+####################################################################
+# Show the base latency over time
+#
+
+query_processor.add_query(
+    title="Latency (base) over time in ms (mean - yellow, max - blue)",
+    query=f"""
+AppMetrics
+| where TimeGenerated >= datetime({test_start_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and TimeGenerated <= datetime({test_stop_time.strftime('%Y-%m-%dT%H:%M:%SZ')})
+    and Name == "aoai-simulator.latency.base"
+| summarize Sum=sum(Sum), Count = sum(ItemCount), Max=max(Max) by bin(TimeGenerated, 10s)
+| project TimeGenerated, mean_latency_ms=1000*Sum/Count, max_latency_ms=1000*Max
+| render timechart
+""".strip(),
+    is_chart=True,
+    columns=["mean_latency_ms", "max_latency_ms"],
+    chart_config={
+        "height": 15,
+        "min": 0,
+        "colors": [
+            asciichart.yellow,
+            asciichart.blue,
+        ],
+    },
+    timespan=timespan,
+    show_query=True,
+    include_link=True,
+)
+
+
+query_errors = query_processor.run_queries()
+
+if query_errors:
+    exit(1)
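As a sanity check on the thresholds used in `validate_429_count` and `validate_mean_tokens_used_per_10s`, a rough back-of-envelope calculation using the values from the script and docs above (approximate: it ignores the Locust ramp-up at 2 users/s and assumes the token budget is fully consumed):

```python
# Back-of-envelope estimate of expected 429s (approximate; ignores the ~15s Locust ramp-up).
run_time_s = 180          # LOCUST_RUN_TIME=3m
users = 30                # LOCUST_USERS=30, i.e. roughly 30 requests per second
max_tokens = 200          # MAX_TOKENS per request
tpm_limit = 100_000       # deployment tokens-per-minute limit

attempted = users * run_time_s                       # ~5400 requests attempted
tokens_per_10s = tpm_limit / 6                       # ~16,667 tokens per 10s window
successful_per_10s = tokens_per_10s // max_tokens    # ~83 requests can succeed per window
successful = successful_per_10s * (run_time_s / 10)  # ~1500 successful requests
expected_429s = attempted - successful               # ~3900, comfortably above the 100 threshold
print(attempted, successful, expected_429s)
```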