Add documentation for scheduler metrics. (#13814)

* Add documentation for scheduler metrics. * Fix renaming of arangodb_scheduler_num_awake_threads. * Add threshold and troubleshoot info for arangodb_scheduler_low_prio_queue_last_dequeue_time. * Update allMetrics.yaml
arangodb · Mar 26, 2021 · b7f4e7c · b7f4e7c
1 parent 73acba5
commit b7f4e7c
Show file tree

Hide file tree

Showing 19 changed files with 173 additions and 60 deletions.
diff --git a/Documentation/Metrics/allMetrics.yaml b/Documentation/Metrics/allMetrics.yaml
@@ -680,7 +680,7 @@
   unit: us
 - category: RocksDB
   complexity: medium
-  description: 'Histogram of the collection/shard lock acquistion times. Locks will
+  description: 'Histogram of the collection/shard lock acquisition times. Locks will
     be acquired for
 
     all write operations, but not for read operations.
@@ -2601,30 +2601,20 @@
   type: counter
   unit: number
 - category: Scheduler
-  complexity: simple
-  description: 'Number of awake worker threads.
+  complexity: medium
+  description: 'The number of jobs currently queued on the scheduler''s high priority
+    queue.
 
-    '
-  exposedBy:
-  - coordinator
-  - dbserver
-  - agent
-  help: 'Number of awake worker threads.
+    The capacity of the high priority queue can be configured via the startup
 
-    '
-  introducedIn: '3.6'
-  name: arangodb_scheduler_awake_threads
-  type: gauge
-  unit: number
-- category: Scheduler
-  complexity: medium
-  description: 'Current queue length of the high priority queue in the scheduler.
+    option `--server.prio1-size`.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Current queue length of the high priority queue in the scheduler.
 
     '
@@ -2709,29 +2699,51 @@
   unit: number
 - category: Scheduler
   complexity: simple
-  description: 'Last recorded dequeue time for a low priority queue item.
+  description: 'Last recorded dequeue time for a low priority queue item, i.e., the
+    amount of
+
+    time the job was sitting in the queue. If there is nothing to do for a long
+
+    time, this metric will be reset to zero.
+
+    A large value for this metric indicates that the server is under heavy load
+
+    and low priority jobs cannot be dequeued in a timely manner.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Last recorded dequeue time for a low priority queue item.
 
     '
   introducedIn: '3.8'
   name: arangodb_scheduler_low_prio_queue_last_dequeue_time
+  threshold: Normally this time should be clearly sub-second.
+  troubleshoot: If you see larger values here, in particular over a longer period
+    of time, you should consider reducing the load of the server (if possible), scaling
+    up (bigger machine) or scaling out (more coordinators). Otherwise requests cannot
+    be processed in a timely manner and you run the risk that the queue becomes full
+    and requests are declined.
   type: gauge
   unit: ms
 - category: Scheduler
   complexity: medium
-  description: 'Current queue length of the low priority queue in the scheduler.
+  description: 'The number of jobs currently queued on the scheduler''s low priority
+    queue.
+
+    The capacity of the low priority queue can be configured via the startup
+
+    option `--server.maximal-queue-size`.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Current queue length of the low priority queue in the scheduler.
 
     '
@@ -2741,13 +2753,21 @@
   unit: number
 - category: Scheduler
   complexity: medium
-  description: 'Current queue length of the maintenance priority queue in the scheduler.
+  description: 'The number of jobs currently queued on the scheduler''s maintenance
+    priority
+
+    queue. These are the jobs with the highest priority and are mainly used for
+
+    cluster internal operations. The capacity of the maintenance priority queue
+
+    can be configured via the startup option `--server.scheduler-queue-size`.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Current queue length of the maintenance priority queue in the scheduler.
 
     '
@@ -2757,13 +2777,19 @@
   unit: number
 - category: Scheduler
   complexity: medium
-  description: 'Current queue length of the medium priority queue in the scheduler.
+  description: 'The number of jobs currently queued on the scheduler''s medium priority
+    queue.
+
+    The capacity of the medium priority queue can be configured via the startup
+
+    option `--server.prio2-size`.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Current queue length of the medium priority queue in the scheduler.
 
     '
@@ -2773,13 +2799,37 @@
   unit: number
 - category: Scheduler
   complexity: simple
-  description: 'Current number of worker threads.
+  description: 'The number of worker threads currently working on some job or spinning
+    while
+
+    waiting for new work (i.e., not sleeping).
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
+  help: 'Number of awake worker threads.
+
+    '
+  introducedIn: '3.8'
+  name: arangodb_scheduler_num_awake_threads
+  type: gauge
+  unit: number
+- category: Scheduler
+  complexity: simple
+  description: 'The number of worker threads currently started. Worker threads can
+    be started
+
+    and stopped dynamically based on the server load.
+
+    '
+  exposedBy:
+  - coordinator
+  - dbserver
+  - agent
+  - single
   help: 'Current number of worker threads.
 
     '
@@ -2789,13 +2839,17 @@
   unit: number
 - category: Scheduler
   complexity: simple
-  description: 'Current number of working threads.
+  description: 'The current number of threads actually working on some job (i.e.,
+    not
+
+    spinning while waiting for new work).
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Current number of working threads.
 
     '
@@ -2805,13 +2859,14 @@
   unit: number
 - category: Scheduler
   complexity: simple
-  description: 'Total number of ongoing RestHandlers coming from the low prio queue.
+  description: 'Total number of low priority jobs currently being processed.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Total number of ongoing RestHandlers coming from the low prio queue.
 
     '
@@ -2821,13 +2876,19 @@
   unit: number
 - category: Scheduler
   complexity: simple
-  description: 'Number of tasks dropped and not added to internal queue.
+  description: 'Number of tasks dropped because the queue was already full. The queue
+    capacities
+
+    can be configured via the startup options `--server.scheduler-queue-size`,
+
+    `--server.prio1-size`, `--server.prio2-size` and `--server.maximal-queue-size`.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Number of tasks dropped and not added to internal queue.
 
     '
@@ -2837,13 +2898,14 @@
   unit: number
 - category: Scheduler
   complexity: simple
-  description: 'Server''s internal queue length.
+  description: 'The total number of currently queued jobs in all queues.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Server''s internal queue length.
 
     '
@@ -2853,13 +2915,17 @@
   unit: number
 - category: Scheduler
   complexity: medium
-  description: 'Total accumulated number of scheduler threads started.
+  description: 'Total accumulated number of scheduler threads started. Worker threads
+    can be
+
+    started and stopped dynamically based on the server load.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Total accumulated number of scheduler threads started.
 
     '
@@ -2869,13 +2935,17 @@
   unit: number
 - category: Scheduler
   complexity: medium
-  description: 'Accumulated total number of scheduler threads stopped.
+  description: 'Total accumulated number of scheduler threads stopped. Worker threads
+    can be
+
+    started and stopped dynamically based on the server load.
 
     '
   exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
   help: 'Accumulated total number of scheduler threads stopped.
 
     '

diff --git a/Documentation/Metrics/arangodb_collection_lock_acquisition_time.yaml b/Documentation/Metrics/arangodb_collection_lock_acquisition_time.yaml
@@ -11,7 +11,7 @@ exposedBy:
   - agent
   - single
 description: |
-  Histogram of the collection/shard lock acquistion times. Locks will be acquired for
+  Histogram of the collection/shard lock acquisition times. Locks will be acquired for
   all write operations, but not for read operations.
   The values here are measured in seconds.
 troubleshoot: |

diff --git a/Documentation/Metrics/arangodb_scheduler_awake_threads.yaml b/Documentation/Metrics/arangodb_scheduler_awake_threads.yaml
diff --git a/Documentation/Metrics/arangodb_scheduler_high_prio_queue_length.yaml b/Documentation/Metrics/arangodb_scheduler_high_prio_queue_length.yaml
@@ -10,5 +10,8 @@ exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
 description: |
-  Current queue length of the high priority queue in the scheduler.
+  The number of jobs currently queued on the scheduler's high priority queue.
+  The capacity of the high priority queue can be configured via the startup
+  option `--server.prio1-size`.
diff --git a/Documentation/Metrics/arangodb_scheduler_low_prio_queue_last_dequeue_time.yaml b/Documentation/Metrics/arangodb_scheduler_low_prio_queue_last_dequeue_time.yaml
@@ -10,5 +10,17 @@ exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
 description: |
-  Last recorded dequeue time for a low priority queue item.
+  Last recorded dequeue time for a low priority queue item, i.e., the amount of
+  time the job was sitting in the queue. If there is nothing to do for a long
+  time, this metric will be reset to zero.
+  A large value for this metric indicates that the server is under heavy load
+  and low priority jobs cannot be dequeued in a timely manner.
+threshold: Normally this time should be clearly sub-second.
+troubleshoot: If you see larger values here, in particular over a longer period
+  of time, you should consider reducing the load of the server (if possible),
+  scaling up (bigger machine) or scaling out (more coordinators). Otherwise
+  requests cannot be processed in a timely manner and you run the risk that the
+  queue becomes full and requests are declined.
+
diff --git a/Documentation/Metrics/arangodb_scheduler_low_prio_queue_length.yaml b/Documentation/Metrics/arangodb_scheduler_low_prio_queue_length.yaml
@@ -10,5 +10,8 @@ exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
 description: |
-  Current queue length of the low priority queue in the scheduler.
+  The number of jobs currently queued on the scheduler's low priority queue.
+  The capacity of the low priority queue can be configured via the startup
+  option `--server.maximal-queue-size`.
diff --git a/Documentation/Metrics/arangodb_scheduler_maintenance_prio_queue_length.yaml b/Documentation/Metrics/arangodb_scheduler_maintenance_prio_queue_length.yaml
@@ -10,5 +10,9 @@ exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
 description: |
-  Current queue length of the maintenance priority queue in the scheduler.
+  The number of jobs currently queued on the scheduler's maintenance priority
+  queue. These are the jobs with the highest priority and are mainly used for
+  cluster internal operations. The capacity of the maintenance priority queue
+  can be configured via the startup option `--server.scheduler-queue-size`.
diff --git a/Documentation/Metrics/arangodb_scheduler_medium_prio_queue_length.yaml b/Documentation/Metrics/arangodb_scheduler_medium_prio_queue_length.yaml
@@ -10,5 +10,8 @@ exposedBy:
   - coordinator
   - dbserver
   - agent
+  - single
 description: |
-  Current queue length of the medium priority queue in the scheduler.
+  The number of jobs currently queued on the scheduler's medium priority queue.
+  The capacity of the medium priority queue can be configured via the startup
+  option `--server.prio2-size`.
diff --git a/Documentation/Metrics/arangodb_scheduler_num_awake_threads.yaml b/Documentation/Metrics/arangodb_scheduler_num_awake_threads.yaml
@@ -0,0 +1,16 @@
+name: arangodb_scheduler_num_awake_threads
+introducedIn: "3.8"
+help: |
+  Number of awake worker threads.
+unit: number
+type: gauge
+category: Scheduler
+complexity: simple
+exposedBy:
+  - coordinator
+  - dbserver
+  - agent
+  - single
+description: |
+  The number of worker threads currently working on some job or spinning while
+  waiting for new work (i.e., not sleeping).