forked from galaxyproject/galaxy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathruntime_stats.py
executable file
·271 lines (225 loc) · 8.42 KB
/
runtime_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/env python
"""
Collect and report statistics on job run times
To use metrics (which provide more accurate information), see:
https://github.com/galaxyproject/galaxy/blob/dev/config/job_metrics_conf.xml.sample
If you do not have metrics enabled, use the `--source history` option to use
the less accurate job_state_history table.
Examples
--------
# Stats for Nate's runs of the Bowtie 2 tool installed from the Tool Shed (all
# versions):
% ./runtime_stats.py -c galaxy.ini -u nate@bx.psu.edu 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/'
# Stats for all runs of the Bowtie 2 tool installed from the Tool Shed (version
# 0.4 only):
% ./runtime_stats.py -c galaxy.ini 'toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/bowtie2/0.4'
# Stats for all runs of the Bowtie 2 tool installed from the Tool Shed but we
# don't feel like figuring out or typing the long ID (matches any tool with
# '/bowtie2/' in its full ID):
% ./runtime_stats.py -c galaxy.ini --like 'bowtie2'
# Stats for all runs of Tophat 2 that took longer than 2 minutes but less than
# 2 days:
% ./runtime_stats.py -c galaxy.ini --like -m $((2 * 60)) -M $((2 * 24 * 60 * 60)) 'tophat2'
"""
import argparse
import os
import re
import sys
import numpy
import psycopg2
from sqlalchemy.engine import url
galaxy_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.insert(1, os.path.join(galaxy_root, 'lib'))
import galaxy.config
from galaxy.util.script import app_properties_from_args, populate_config_args
# Valid values for the --source option.
DATA_SOURCES = ('metrics', 'history')
# Preferred source: exact per-job runtime recorded by the 'core' job
# metrics plugin (requires job metrics collection to be enabled).
# {tool_clause}/{user_clause}/{time_clause} are filled in by query().
METRICS_SQL = """
SELECT metric_value
FROM job_metric_numeric jmn
JOIN job j ON jmn.job_id = j.id
WHERE j.state = 'ok'
AND jmn.plugin = 'core'
AND jmn.metric_name = 'runtime_seconds'
{tool_clause}
{user_clause}
{time_clause}
"""
# Fallback source: approximate runtime as the interval between the two
# most recent job state changes (expected 'running' -> 'ok'), taken
# from the job_state_history table.  Less accurate than metrics.
HISTORY_SQL = """
SELECT ctimes[1] - ctimes[2] AS delta
FROM (SELECT jsh.job_id,
array_agg(jsh.create_time ORDER BY jsh.create_time DESC) AS ctimes
FROM job_state_history jsh
JOIN job j ON jsh.job_id = j.id
WHERE jsh.state IN ('running','ok')
AND j.state = 'ok'
{tool_clause}
{user_clause}
GROUP BY jsh.job_id) AS t_arrs
{time_clause}
"""
def parse_arguments():
    """Parse command-line options and resolve database connection args.

    Returns the parsed ``argparse.Namespace`` augmented with a
    ``connect_args`` attribute: a dict of psycopg2-style connection
    parameters derived from Galaxy's configured database URL.

    Exits with status 2 on an invalid option combination or an unknown
    data source.
    """
    parser = argparse.ArgumentParser(
        description='Generate walltime statistics')
    parser.add_argument('tool_id', help='Tool (by ID) to collect stats about')
    parser.add_argument('--like',
                        action='store_true',
                        default=False,
                        help='Use SQL `LIKE` operator to find '
                             'a shed-installed tool using the tool\'s '
                             '"short" id')
    populate_config_args(parser)
    parser.add_argument('-d', '--debug',
                        action='store_true',
                        default=False,
                        help='Print extra info')
    parser.add_argument('-m', '--min',
                        type=int,
                        default=-1,
                        help='Ignore runtimes less than MIN seconds')
    parser.add_argument('-M', '--max',
                        type=int,
                        default=-1,
                        help='Ignore runtimes greater than MAX seconds')
    parser.add_argument('-u', '--user',
                        help='Return stats for only this user (id, email, '
                             'or username)')
    parser.add_argument('-s', '--source',
                        default='metrics',
                        help='Runtime data source (SOURCES: %s)'
                             % ', '.join(DATA_SOURCES))
    args = parser.parse_args()
    if args.like and '/' in args.tool_id:
        print('ERROR: Do not use --like with a tool shed tool id (the tool '
              'id should not contain `/` characters)')
        sys.exit(2)
    args.source = args.source.lower()
    # Use DATA_SOURCES instead of a duplicated inline tuple, and exit on
    # failure — previously this fell through and ran with the bad source.
    if args.source not in DATA_SOURCES:
        print('ERROR: Data source `%s` unknown, valid sources are: %s'
              % (args.source, ', '.join(DATA_SOURCES)))
        sys.exit(2)
    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)
    # NOTE(review): assumes `args.config` exposes get_database_url();
    # confirm populate_config_args() actually provides such an object.
    uri = args.config.get_database_url(config)
    # Map SQLAlchemy URL field names onto psycopg2 connect arg names.
    names = {'database': 'dbname', 'username': 'user'}
    args.connect_args = url.make_url(uri).translate_connect_args(**names)
    if args.debug:
        print('Got options:')
        for i in vars(args).items():
            print('%s: %s' % i)
    return args
def query(tool_id=None, user=None, like=None, source='metrics',
          connect_args=None, debug=False, min=-1, max=-1, **kwargs):
    """Collect runtimes for ``tool_id`` and print summary statistics.

    tool_id:      full or partial tool id (see --like handling below)
    user:         optional user id, email, or username to filter on
    like:         match tool_id with SQL LIKE instead of equality
    source:       'metrics' (job_metric_numeric) or 'history'
                  (job_state_history)
    connect_args: dict of psycopg2 connection parameters
    debug:        print each executed query
    min, max:     when > 0, ignore runtimes outside (min, max) seconds;
                  the names shadow builtins but are kept because callers
                  invoke ``query(**vars(args))`` with argparse dests
    kwargs:       ignored — lets callers pass the whole namespace

    Exits with status 1 if ``user`` cannot be resolved.
    """
    # BUG FIX: psycopg2 DSN key=value pairs must be whitespace-separated;
    # the old loop concatenated them with no separator, producing an
    # invalid DSN (e.g. 'dbname=galaxyhost=db') for >1 connect arg.
    connect_arg_str = ' '.join(f'{k}={v}' for k, v in connect_args.items())
    pc = psycopg2.connect(connect_arg_str)
    cur = pc.cursor()
    if user:
        try:
            user_id = int(user)
        except ValueError:
            # Not a numeric id: resolve email or username to a user id.
            if '@' not in user:
                field = 'username'
            else:
                field = 'email'
            sql = 'SELECT id FROM galaxy_user WHERE {} = {}'.format(field, '%s')
            cur.execute(sql, (user,))
            if debug:
                print('Executed:')
                print(cur.query)
            row = cur.fetchone()
            if row:
                user_id = row[0]
            else:
                print('Invalid user: %s' % user)
                sys.exit(1)
    if like:
        # "Short" id: match any shed-installed tool containing /<id>/.
        query_tool_id = '%%/%s/%%' % tool_id
    elif '/' in tool_id and not re.match(r'\d+\.\d+', tool_id.split('/')[-1]):
        # Full shed id without a version suffix: match all versions.
        query_tool_id = '%s%%' % tool_id
        like = True
    else:
        query_tool_id = tool_id
    sql_args = [query_tool_id]
    if like:
        tool_clause = "AND j.tool_id LIKE %s"
    else:
        tool_clause = "AND j.tool_id = %s"
    if user:
        user_clause = "AND j.user_id = %s"
        sql_args.append(user_id)
    else:
        user_clause = ""
    if source == 'metrics':
        # Metric values are plain seconds; compare numerically.
        if min > 0 and max > 0:
            time_clause = """AND metric_value > %s
AND metric_value < %s"""
            sql_args.append(min)
            sql_args.append(max)
        elif min > 0:
            time_clause = "AND metric_value > %s"
            sql_args.append(min)
        elif max > 0:
            time_clause = "AND metric_value < %s"
            sql_args.append(max)
        else:
            time_clause = ""
        sql = METRICS_SQL
    elif source == 'history':
        # History deltas are intervals; compare against interval literals.
        if min > 0 and max > 0:
            time_clause = """WHERE ctimes[1] - ctimes[2] > interval %s
AND ctimes[1] - ctimes[2] < interval %s"""
            sql_args.append('%s seconds' % min)
            sql_args.append('%s seconds' % max)
        elif min > 0:
            time_clause = "WHERE ctimes[1] - ctimes[2] > interval %s"
            sql_args.append('%s seconds' % min)
        elif max > 0:
            time_clause = "WHERE ctimes[1] - ctimes[2] < interval %s"
            sql_args.append('%s seconds' % max)
        else:
            time_clause = ""
        sql = HISTORY_SQL
    sql = sql.format(tool_clause=tool_clause, user_clause=user_clause,
                     time_clause=time_clause)
    cur.execute(sql, sql_args)
    if debug:
        print('Executed:')
        print(cur.query)
    print('Query returned %d rows' % cur.rowcount)
    if source == 'metrics':
        # Drop NULL/zero runtimes.
        times = numpy.array([r[0] for r in cur if r[0]])
    elif source == 'history':
        # Rows are timedelta-like intervals; convert to seconds.
        times = numpy.array([r[0].total_seconds() for r in cur if r[0]])
    print('Collected %d times' % times.size)
    if times.size == 0:
        return
    if user:
        print('Displaying statistics for user %s' % user)
    stats = (('Mean runtime', numpy.mean(times)),
             ('Standard deviation', numpy.std(times)),
             ('Minimum runtime', times.min()),
             ('Maximum runtime', times.max()))
    for name, seconds in stats:
        hours, minutes = nice_times(seconds)
        msg = name + ' is %0.0f seconds' % seconds
        if minutes:
            msg += ' (=%0.2f minutes)' % minutes
        if hours:
            msg += ' (=%0.2f hours)' % hours
        print(msg)
def nice_times(seconds):
    """Convert a duration in seconds to a ``(hours, minutes)`` pair.

    Each component is ``None`` when the duration is too short for that
    unit to be meaningful (under one minute / under one hour).
    """
    minutes = seconds / 60 if seconds >= 60 else None
    hours = seconds / 3600 if seconds >= 3600 else None
    return hours, minutes
def main():
    """Entry point: parse CLI options and report runtime statistics."""
    options = vars(parse_arguments())
    query(**options)


if __name__ == '__main__':
    main()