diff --git a/README.rst b/README.rst
index 878c0c4..92b366a 100644
--- a/README.rst
+++ b/README.rst
@@ -2,9 +2,9 @@
 Fyrd
 ####
 
-One liner script and function submission to torque, slurm, or a local machines
-with dependency tracking using python. Uses the same syntax irrespective of
-cluster environment!
+One-liner script and function submission to torque or slurm clusters with
+dependency tracking using Python. Uses the same syntax irrespective of cluster
+environment!
 
 Learn more at https://fyrd.science, https://fyrd.rtfd.com, and
 https://github.com/MikeDacre/fyrd
@@ -20,7 +20,7 @@ https://github.com/MikeDacre/fyrd
 +---------+----------------------------------------------------+
 | License | MIT License, property of Stanford, use as you wish |
 +---------+----------------------------------------------------+
-| Version | 0.6.1b9                                            |
+| Version | 0.6.2a1                                            |
 +---------+----------------------------------------------------+
 
@@ -134,9 +134,7 @@ can be done like this:
 
     jobs = []
     for i in huge_list:
         jobs.append(fyrd.Job(my_function, (i,), profile='small').submit())
-    results = []
-    for i in jobs:
-        results.append(i.get())
+    results = fyrd.get(jobs)
 
 The results list in this example will contain the function outputs, even if
 those outputs are integers, objects, or other Python types. Similarly, shell
@@ -148,10 +146,9 @@ scripts can be run like this:
 
     jobs = []
     for i in [i for i in os.listdir('.') if i.endswith('.gz')]:
         jobs.append(fyrd.Job(script.format(i), profile='long').submit())
-    results = []
-    for i in jobs:
-        i.wait()
-        results.append(i.stdout)
+    results = fyrd.get(jobs)
+    for i in results:
+        print(i.stdout)
 
 Results will contain the contents of STDOUT for the submitted script
 
@@ -295,6 +292,7 @@ This software requires the following external modules:
 - `tabulate `_ — allows readable printing of help
 - `six `_ — makes python2/3 cross-compatibility easier
 - `tblib `_ — allows me to pass Tracebacks between nodes
+- `tqdm `_ — pretty progress bars for multi-job get and wait
 
 Cluster Dependencies
 ....................
 
diff --git a/bin/frun b/bin/frun
new file mode 100755
index 0000000..eb0615c
--- /dev/null
+++ b/bin/frun
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Simple alias for `fyrd run`
+# Allows the user to run arbitrary shell commands on the cluster
+fyrd run "$@"
diff --git a/bin/fsub b/bin/fsub
new file mode 100755
index 0000000..4da83be
--- /dev/null
+++ b/bin/fsub
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Simple alias for `fyrd submit`
+# Allows the user to submit existing job files on the cluster
+fyrd submit "$@"
diff --git a/docs/fyrd_manual.pdf b/docs/fyrd_manual.pdf
index e8615d8..c08c088 100644
Binary files a/docs/fyrd_manual.pdf and b/docs/fyrd_manual.pdf differ
diff --git a/docs/sphinx/adding_batch_systems.rst b/docs/sphinx/adding_batch_systems.rst
new file mode 100644
index 0000000..2baaa68
--- /dev/null
+++ b/docs/sphinx/adding_batch_systems.rst
@@ -0,0 +1 @@
+.. include:: ../../fyrd/batch_systems/README.rst
diff --git a/docs/sphinx/api.rst b/docs/sphinx/api.rst
index 3909d72..ef3cd81 100644
--- a/docs/sphinx/api.rst
+++ b/docs/sphinx/api.rst
@@ -35,34 +35,35 @@ Methods
 
 .. automethod:: fyrd.queue.Queue.wait
 
-.. automethod:: fyrd.queue.Queue.wait_to_submit
+.. automethod:: fyrd.queue.Queue.get
 
-.. automethod:: fyrd.queue.Queue.get_jobs
+.. automethod:: fyrd.queue.Queue.wait_to_submit
 
-.. automethod:: fyrd.queue.Queue.update
+.. automethod:: fyrd.queue.Queue.test_job_in_queue
 
-.. autoclass:: fyrd.queue.Queue.QueueJob
+.. automethod:: fyrd.queue.Queue.get_jobs
 
-.. 
autoexception:: fyrd.queue.QueueError +.. automethod:: fyrd.queue.Queue.get_user_jobs -fyrd.queue functions -.................... +.. automethod:: fyrd.queue.Queue.update -parsers -~~~~~~~ +.. automethod:: fyrd.queue.Queue.check_dependencies -.. autofunction:: fyrd.queue.queue_parser +fyrd.queue Jobs +................ -.. autofunction:: fyrd.queue.torque_queue_parser +Hold information about individual jobs, `QueueJob` about primary jobs, +`QueueChild` about individual array jobs (which are stored in the `children` +attribute of `QueueJob` objects. -.. autofunction:: fyrd.queue.slurm_queue_parser +.. autoclass:: fyrd.queue.QueueJob -utilities -~~~~~~~~~ +.. autoclass:: fyrd.queue.QueueChild -.. autofunction:: fyrd.queue.get_cluster_environment +fyrd.queue.QueueError +..................... -.. autofunction:: fyrd.queue.check_queue +.. autoexception:: fyrd.queue.QueueError fyrd.job @@ -96,12 +97,24 @@ fyrd.job.Job Methods ~~~~~~~ +.. automethod:: fyrd.job.Job.initialize + +.. automethod:: fyrd.job.Job.gen_scripts + .. automethod:: fyrd.job.Job.write .. automethod:: fyrd.job.Job.clean +.. automethod:: fyrd.job.Job.scrub + .. automethod:: fyrd.job.Job.submit +.. automethod:: fyrd.job.Job.resubmit + +.. automethod:: fyrd.job.Job.get_keywords + +.. automethod:: fyrd.job.Job.set_keywords + .. automethod:: fyrd.job.Job.wait .. automethod:: fyrd.job.Job.get @@ -137,8 +150,22 @@ including writing the files. `Function` is actually a child class of `Script`. :members: :show-inheritance: -fyrd.options ------------- +fyrd.batch_systems +------------------ + +All batch systems are defined here. + +fyrd.batch_systems functions +............................ + +.. autofunction:: fyrd.batch_systems.get_cluster_environment + +.. autofunction:: fyrd.batch_systems.check_queue + +.. autofunction:: fyrd.batch_systems.get_batch_system + +fyrd.batch_systems.options +.......................... All `keyword arguments `_ are defined in dictionaries in the `options.py` file, alongside function to manage those dictionaries. Of @@ -171,17 +198,17 @@ an empty string is returned. whole dictionary of arguments, it explicitly handle arguments that cannot be managed using a simple string format. -.. autofunction:: fyrd.options.option_help +.. autofunction:: fyrd.batch_systems.options.option_help -.. autofunction:: fyrd.options.sanitize_arguments +.. autofunction:: fyrd.batch_systems.options.sanitize_arguments -.. autofunction:: fyrd.options.split_keywords +.. autofunction:: fyrd.batch_systems.options.split_keywords -.. autofunction:: fyrd.options.check_arguments +.. autofunction:: fyrd.batch_systems.options.check_arguments -.. autofunction:: fyrd.options.options_to_string +.. autofunction:: fyrd.batch_systems.options.options_to_string -.. autofunction:: fyrd.options.option_to_string +.. autofunction:: fyrd.batch_systems.options.option_to_string fyrd.conf --------- @@ -309,48 +336,6 @@ from any directory. .. autofunction:: fyrd.basic.clean_dir() -fyrd.local ----------- - -The local queue implementation is based on the multiprocessing library and is -not intended to be used directly, it should always be used via the Job class -because it is somewhat temperamental. The essential idea behind it is that we -can have one JobQueue class that is bound to the parent process, it exclusively -manages a single child thread that runs the `job_runner()` function. The two -process communicate using a `multiprocessing.Queue` object, and pass -`fyrd.local.Job` objects back and forth between them. 
-
-The Job objects (different from the Job objects in `job.py`) contain information
-about the task to run, including the number of cores required. The job runner
-manages a pool of `multiprocessing.Pool` tasks directly, and keeps the total
-running cores below the total allowed (default is the system max, can be set
-with the threads keyword). It backfills smaller jobs and holds on to larger jobs
-until there is enough space free.
-
-This is close to what torque and slurm do, but vastly more crude. It serves as a
-stopgap to allow parallel software written for compute clusters to run on a
-single machine in a similar fashion, without the need for a pipeline alteration.
-The reason I have reimplemented a process pool is that I need dependency
-tracking and I need to allow some processes to run on multiple cores (e.g. 6 of
-the available 24 on the machine).
-
-The `job_runner()` and `Job` objects should never be accessed except by the
-JobQueue. Only one JobQueue should run at a time (not enforced), and by default
-it is bound to `fyrd.local.JQUEUE`. That is the interface used by all
-other parts of this package.
-
-fyrd.local.JobQueue
-...................
-
-.. autoclass:: fyrd.local.JobQueue
-   :members:
-   :show-inheritance:
-
-fyrd.local.job_runner
-.....................
-
-.. autofunction:: fyrd.local.job_runner
-
 fyrd.run
 --------
 
diff --git a/docs/sphinx/basic_usage.rst b/docs/sphinx/basic_usage.rst
index 73a8513..773de22 100644
--- a/docs/sphinx/basic_usage.rst
+++ b/docs/sphinx/basic_usage.rst
@@ -17,7 +17,7 @@ To run with dependency tracking, run:
 
    import fyrd
    job  = fyrd.submit(<command>)
    job2 = fyrd.submit(<command>, depends=job)
-   out  = job2.get()  # Will block until job completes
+   out1, out2 = fyrd.get([job, job2])  # Will block until jobs complete
 
 The `submit()` function is actually just a wrapper for the `Job
 `_ class. The same behavior as above can be
@@ -36,6 +36,25 @@ can be called on job initialization. Also note that the object returned by
 calling the `submit()` function (as in the first example) is also a `Job`
 object, so these two examples can be used fully interchangeably.
 
+Similar wrappers allow you to submit and monitor existing job files, such
+as those made by other pipelines:
+
+.. code:: python
+
+   import os
+   import fyrd
+   jobs = []
+   job_dir = os.path.abspath('./jobs/')
+   for job in [os.path.join(job_dir, i) for i in os.listdir(job_dir) if i.endswith('.sh')]:
+       jobs.append(fyrd.submit_file(job))
+   fyrd.wait(jobs)  # Will block until every job is completed
+
+The same thing can also be accomplished using the `console script `_:
+
+.. code:: shell
+
+   fyrd submit --wait ./jobs/*.sh
+
 Functions
 ---------
 
diff --git a/docs/sphinx/changelog.rst b/docs/sphinx/changelog.rst
new file mode 100644
index 0000000..3f56b5b
--- /dev/null
+++ b/docs/sphinx/changelog.rst
@@ -0,0 +1,43 @@
+Change Log
+==========
+
+Version 0.6.2a1
+---------------
+
+This version brings a major overhaul to the structure of the code, while leaving the
+API *mostly* intact.
+
+Major Changes
+.............
+
+- Batch system definitions are now fully modular and are contained in the `fyrd.batch_systems`
+  package. `options.py` has also been moved into this package, which allows any programmer
+  to add a new batch system definition to fyrd by just editing the contents of that small
+  subpackage.
+- Updated the console script to allow running arbitrary shell scripts on the cluster with
+  `fyrd run` and submitting any number of existing job files with `fyrd sub`. Added the
+  new alias scripts `frun` and `fsub` for these new modes as well. Both new modes accept
+  the `--wait` argument, meaning that they will block until the jobs complete.
+- Documentation overhauled to update the API and add instructions on creating a new batch
+  system; these instructions are duplicated in the README within the `batch_systems` package
+  folder.
+- **Local support temporarily removed**. It didn't work very well and it broke the new
+  batch system structure; I hope to add it back again shortly.
+- Full support for array job parsing for both torque and slurm. We now create one job entry
+  for each array job child, instead of one for each array job. To manage this, the
+  `fyrd.queue.Queue.QueueJob` class was moved to `fyrd.queue.QueueJob` and split to add a
+  child class, `fyrd.queue.QueueChild`. All array jobs now have one `fyrd.queue.QueueJob`
+  job, plus one `fyrd.queue.QueueChild` job for each of their children, which are stored
+  in the `children` dictionary in the `fyrd.queue.QueueJob` class.
+- Added a `get` method to the `fyrd.queue.Queue` class to allow a user to get outputs from
+  a list of jobs; it loops continuously through the jobs so that jobs are not lost.
+- Added `tqdm `_ as a requirement and enabled progress bars
+  in multi-job wait and get.
+
+Minor Changes
+.............
+
+- Updated the documentation to include this changelog, which will only contain change
+  information for version 0.6.2a1 onwards.
+- Added additional tests to cover the new changes as well as generally increase test suite
+  coverage.
+- Several small bug fixes.
diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index 5040cb9..7e267d1 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -9,7 +9,7 @@ copyright = '2016, Michael Dacre '
 author = 'Michael Dacre '
 version = '0.6'
-release = '0.6.1b9'
+release = '0.6.2a1'
 language = 'en'
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/sphinx/console.rst b/docs/sphinx/console.rst
index 55142d2..4766f94 100644
--- a/docs/sphinx/console.rst
+++ b/docs/sphinx/console.rst
@@ -12,11 +12,13 @@ fyrd
 
 This software uses a subcommand system to separate modes, and has eight modes:
 
+- `run` - run an arbitrary shell script on the cluster
+- `submit` - submit existing job file(s) to the cluster
+- `wait` - wait for a list of jobs
+- `queue` - show running jobs, makes filtering jobs very easy
 - `config` — show and edit the contents of the config file
 - `profile` - inspect and manage cluster profiles
 - `keywords` - print a list of current keyword arguments with descriptions for each
-- `queue` - show running jobs, makes filtering jobs very easy
-- `wait` - wait for a list of jobs
 - `clean` - clean all script and output files in the given directory
 
 Several of the commands have aliases (`conf` and `prof` being the two main ones)
@@ -24,6 +26,16 @@ Several of the commands have aliases (`conf` and `prof` being the two main ones)
 Examples
 ........
 
+.. code:: shell
+
+   fyrd run 'samtools view big_file.bam | python $HOME/bin/my_parser.py > outfile'
+   fyrd run --profile long --args walltime=24:00:00,mem=20G --wait \
+        'samtools view big_file.bam | python $HOME/bin/my_parser.py > outfile'
+
+.. code:: shell
+
+   fyrd submit --wait ./jobs/*.sh
+
 .. code:: shell
 
    fyrd prof list
@@ -52,7 +64,7 @@ All Options
 
 `fyrd`::
 
-    usage: fyrd [-h] [-v] {conf,prof,keywords,queue,wait,clean} ...
+    usage: fyrd [-h] [-v] {run,submit,wait,queue,conf,prof,keywords,clean} ...
 
     Manage fyrd config, profiles, and queue.
 
@@ -60,23 +72,103 @@ All Options
     Author       Michael D Dacre 
     Organization Stanford University
     License      MIT License, use as you wish
-    Version      0.6.2b9
+    Version      0.6.2a1
     ============ ======================================
 
    positional arguments:
-     {conf,prof,keywords,queue,wait,clean}
+     {run,submit,wait,queue,conf,prof,keywords,clean}
+       run (r)             Run simple shell scripts
+       submit (sub, s)     Submit existing job files
+       wait (w)            Wait for jobs
+       queue (q)           Search the queue
        conf (config)       View and manage the config
        prof (profile)      Manage profiles
        keywords (keys, options)
                            Print available keyword arguments.
-       queue (q)           Search the queue
-       wait                Wait for jobs
        clean               Clean up a job directory
 
    optional arguments:
      -h, --help            show this help message and exit
      -v, --verbose         Show debug outputs
 
+`fyrd run`::
+
+    usage: fyrd run [-h] [-w] [-p PROFILE] [-a ARGS]
+                    shell_script [shell_script ...]
+
+    Run a shell script on the cluster and optionally wait for completion.
+
+    positional arguments:
+      shell_script          The script to run
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -w, --wait            Wait for the job to complete
+      -p PROFILE, --profile PROFILE
+                            The profile to use to run
+      -a ARGS, --args ARGS  Submission args, e.g.:
+                            'time=00:20:00,mem=20G,cores=10'
+
+`fyrd submit`::
+
+    usage: fyrd submit [-h] [-w] shell_scripts [shell_scripts ...]
+
+    Submit existing job files to the cluster and optionally wait for completion.
+
+    positional arguments:
+      shell_scripts         The script(s) to submit
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -w, --wait            Wait for the jobs to complete
+
+`fyrd wait`::
+
+    usage: fyrd wait [-h] [-u USERS] [jobs [jobs ...]]
+
+    Wait on a list of jobs, block until they complete.
+
+    positional arguments:
+      jobs                  Job list to wait for
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -u USERS, --users USERS
+                            A comma-separated list of users to wait for
+
+`fyrd queue`::
+
+    usage: fyrd queue [-h] [-u [...] | -a] [-p [...]] [-r | -q | -d | -b]
+                      [-l | -c]
+
+    Check the local queue, similar to squeue or qstat but simpler, good for
+    quickly checking the queue.
+
+    By default it searches only your own jobs, pass '--all-users' or
+    '--users [...]' to change that behavior.
+
+    To just list jobs with some basic info, run with no arguments.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+
+    queue filtering:
+      -u [ ...], --users [ ...]
+                            Limit to these users
+      -a, --all-users       Display jobs for all users
+      -p [ ...], --partitions [ ...]
+                            Limit to these partitions (queues)
+
+    queue state filtering:
+      -r, --running         Show only running jobs
+      -q, --queued          Show only queued jobs
+      -d, --done            Show only completed jobs
+      -b, --bad             Show only failed jobs
+
+    display options:
+      -l, --list            Print job numbers only, works well with xargs
+      -c, --count           Print job count only
+
 `fyrd conf`::
 
     usage: fyrd conf [-h] {show,list,help,update,alter,init} ...
@@ -163,53 +255,6 @@ All Options
      -s, --split-tables  Print keywords as multiple tables
      -l, --list          Print a list of keywords only
 
-`fyrd queue`::
-
-    usage: fyrd queue [-h] [-u [...] | -a] [-p [...]] [-r | -q | -d | -b]
-                      [-l | -c]
-
-    Check the local queue, similar to squeue or qstat but simpler, good for
-    quickly checking the queue.
-
-    By default it searches only your own jobs, pass '--all-users' or
-    '--users [...]' to change that behavior.
-
-    To just list jobs with some basic info, run with no arguments. 
- - optional arguments: - -h, --help show this help message and exit - - queue filtering: - -u [ ...], --users [ ...] - Limit to these users - -a, --all-users Display jobs for all users - -p [ ...], --partitions [ ...] - Limit to these partitions (queues) - - queue state filtering: - -r, --running Show only running jobs - -q, --queued Show only queued jobs - -d, --done Show only completed jobs - -b, --bad Show only completed jobs - - display options: - -l, --list Print job numbers only, works well with xargs - -c, --count Print job count only - -`fyrd wait`:: - - usage: fyrd wait [-h] [-u USERS] [jobs [jobs ...]] - - Wait on a list of jobs, block until they complete. - - positional arguments: - jobs Job list to wait for - - optional arguments: - -h, --help show this help message and exit - -u USERS, --users USERS - A comma-separated list of users to wait for - `fyrd clean`:: usage: fyrd clean [-h] [-o] [-s SUFFIX] [-q {torque,slurm,local}] [-n] [dir] @@ -253,6 +298,8 @@ Aliases Several shell scripts are provided in `bin/` to provide shortcuts to the *fyrd* subcommands: +- `frun`: `fyrd run` +- `fsub`: `fyrd submit` - `my-queue` (or `myq`): `fyrd queue` - `clean-job-files`: `fyrd clean` - `monitor-jobs`: `fyrd wait` diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 57ad90d..ccd9ccc 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -8,7 +8,7 @@ Python job submission on torque and slurm clusters with dependency tracking. +---------+----------------------------------------------------+ | License | MIT License, property of Stanford, use as you wish | +---------+----------------------------------------------------+ -| Version | 0.6.1b9 | +| Version | 0.6.2a1 | +---------+----------------------------------------------------+ .. only:: html @@ -57,5 +57,7 @@ Contents: keywords console advanced_usage + adding_batch_systems api + changelog indices diff --git a/docs/sphinx/simple_index.rst b/docs/sphinx/simple_index.rst index d459431..1fc86f1 100644 --- a/docs/sphinx/simple_index.rst +++ b/docs/sphinx/simple_index.rst @@ -7,10 +7,6 @@ Python job submission on torque and slurm clusters with dependency tracking. :alt: fyrd cluster logo— a Saxon shield remeniscent of those used in fyrds :target: https://fyrd.readthedocs.org -Pronounced 'feared' (sort of), Anglo-Saxon for an army, particularly an army of -freemen (an army of nodes). The logo is based on a Saxon shield commonly used in -these fyrds. This library used to be known as 'Python Cluster'. - Allows simple job submission with *dependency tracking and queue waiting* on either torque, slurm, or locally with the multiprocessing module. It uses simple techniques to avoid overwhelming the queue and to catch bugs on the fly. @@ -19,9 +15,21 @@ It is routinely tested on Mac OS and Linux with slurm and torque clusters, or in the absence of a cluster, on Python versions 2.7.10, 2.7.11, 2.7.12, 3.3.0, 3.4.0, 3.5.2, 3.6.2, and 3.7-dev. The full test suite is available in the `tests` folder. -For complete documentation see `the documentation site `_ -and the `Fyrd.pdf `_ document in this repository. +Fyrd is pronounced 'feared' (sort of), it is an Anglo-Saxon term for an army, +particularly an army of freemen (in this case an army of compute nodes). The +logo is based on a Saxon shield commonly used by these groups. This software +was formerly known as 'Python Cluster'. + +The code is hosted at github: +`https://github.com/MikeDacre/fyrd `_ + +To install, use `PyPI `_: +.. 
code:: shell
+
+   pip install fyrd
+   fyrd conf init
+
 Contents:
 
 .. toctree::
 
    keywords_no_table
    console
    advanced_usage
+   adding_batch_systems
    api
+   changelog
diff --git a/fyrd/__init__.py b/fyrd/__init__.py
index b2bb523..75a9c8c 100644
--- a/fyrd/__init__.py
+++ b/fyrd/__init__.py
@@ -6,20 +6,20 @@
           AUTHOR: Michael D Dacre, mike.dacre@gmail.com
     ORGANIZATION: Stanford University
          LICENSE: MIT License, property of Stanford, use as you wish
-         VERSION: 0.6.1b9
+         VERSION: 0.6.2a1
          CREATED: 2015-12-11 22:19
-   Last modified: 2017-08-03 14:35
+   Last modified: 2017-08-07 15:28
    =============== ===================================================
 
 Allows simple job submission with *dependency tracking and queue waiting* with
-either torque, slurm, or locally with the multiprocessing module. It uses
-simple techiques to avoid overwhelming the queue and to catch bugs on the fly.
+either torque or slurm. It uses simple techniques to avoid overwhelming the
+queue and to catch bugs on the fly.
 
 Setting Environment
 -------------------
 
-To set the environement, set queue.MODE to one of ['torque',
-'slurm', 'local'], or run get_cluster_environment().
+To set the environment, set batch_systems.MODE to one of the defined batch
+systems, or run get_cluster_environment().
 
 Simple Use
 ----------
@@ -100,11 +100,10 @@
 Job Files
 ---------
 
-All jobs write out a job file before submission, even though this is not
-necessary (or useful) with multiprocessing. In local mode, this is a .cluster
-file, in slurm is is a .cluster.sbatch and a .cluster.script file, in torque it
-is a .cluster.qsub file. 'cluster' is set by the suffix keyword, and can be
-overridden.
+All jobs write out a job file before submission; these files end in
+.<suffix>, where the suffix defaults to 'cluster', can be set with the
+'suffix' keyword argument, and is defined in the batch_systems
+config.
 
 To change the directory these files are written to, use the 'filedir' keyword
 argument to Job or submit. *NOTE:* This *must* be accessible to the compute
@@ -118,9 +117,7 @@
 Dependency Tracking
 -------------------
 
-Dependency tracking is supported in all modes. Local mode uses a unique
-queueing system that works similarly to torque and slurm and which is defined
-in local.py.
+Dependency tracking is supported in all modes.
 
 To use dependency tracking in any mode pass a list of job ids to submit or
 submit_file with the `dependencies` keyword argument.
@@ -136,21 +133,15 @@
 ----
 
 Full help is available at::
-    github.com/MikeDacre/fyrd
+    https://fyrd.rtfd.io
 """
 import os as _os
 import signal as _signal
 import atexit as _atexit
 
 # Version Number
-__version__ = '0.6.1b9'
+__version__ = '0.6.2a1'
 
-#################################################
-#  Currently configured job submission systems  #
-#################################################
-
-ALLOWED_MODES = ['local', 'torque', 'slurm']
-# Current mode held in queue.MODE
 
 ###################
 #  House Keeping  #
 ###################
@@ -163,22 +154,23 @@ class ClusterError(Exception):
     pass
 
+
 #########################################
 #  Make our functions easily available  #
 #########################################
 
-from . import local
+
 from . import queue
 from . import job
 from . import conf
-from . import options
 from . import helpers
+from . 
import batch_systems from .run import check_pid as _check_pid from .queue import Queue -from .queue import wait -from .queue import check_queue -from .queue import get_cluster_environment +from .batch_systems import check_queue +from .batch_systems import get_cluster_environment +from .batch_systems import options from .job import Job from .basic import submit @@ -186,31 +178,24 @@ class ClusterError(Exception): from .basic import make_job_file from .basic import clean from .basic import clean_dir +from .basic import wait +from .basic import get from .conf import set_profile from .conf import get_profile from .conf import get_profiles -from .options import option_help +option_help = batch_systems.options.option_help + +# import fyrd.batch_system as batch_system -__all__ = ['Job', 'Queue', 'wait', 'submit', 'submit_file', 'make_job_file', - 'clean', 'clean_dir', 'check_queue', 'option_help', 'set_profile', - 'get_profile', 'get_profiles', 'helpers'] +__all__ = ['Job', 'Queue', 'wait', 'get', 'submit', 'submit_file', + 'make_job_file', 'clean', 'clean_dir', 'check_queue', 'option_help', + 'set_profile', 'get_profile', 'get_profiles', 'conf', 'helpers'] ########################## # Set the cluster type # ########################## -queue.MODE = get_cluster_environment() +batch_systems.MODE = get_cluster_environment() check_queue() - - -############################### -# Kill the JobQueue on exit # -############################### -# def _kill_local(): - # if local.JQUEUE and _check_pid(local.JQUEUE.pid): - # local.JQUEUE.terminate() - # # del(local.JQUEUE) - -# _atexit.register(_kill_local) diff --git a/fyrd/__main__.py b/fyrd/__main__.py index 18a42d8..2f8fa68 100644 --- a/fyrd/__main__.py +++ b/fyrd/__main__.py @@ -7,7 +7,7 @@ Author Michael D Dacre Organization Stanford University License MIT License, use as you wish -Version 0.6.2b9 +Version 0.6.2a1 ============ ====================================== """ from __future__ import print_function @@ -24,6 +24,20 @@ # Help Text # ############################################################################### +RUN_HELP = """\ +Run a shell script on the cluster and optionally wait for completion. +""" + +RUN_JOB_HELP = """\ +Run an existing job file or set of files on the cluster and optionally wait +for completion. + +e.g. fyrd run_file --wait ./jobs/*sh +""" + +WAIT_HELP = """\ +Wait on a list of jobs, block until they complete. +""" CONF_HELP = """\ This script allows display and management of the fyrd config file found @@ -92,10 +106,6 @@ To just list jobs with some basic info, run with no arguments. """ -WAIT_HELP = """\ -Wait on a list of jobs, block until they complete. -""" - DEFAULT_CONF_SECTIONS = set(fyrd.conf.DEFAULTS.keys()) DEFAULT_CONF_OPTS = set( chain(*[list(i.keys()) for i in fyrd.conf.DEFAULTS.values()]) @@ -164,9 +174,9 @@ # Catch Keyboard Interruption # ############################################################################### -def catch_keyboard(signal, frame): +def catch_keyboard(sig, frame): """Catch Keyboard Interruption.""" - sys.stderr.write('\nKeyBoard Interrupt Detected, Exiting\n') + sys.stderr.write('\nKeyboard Interrupt Detected, Exiting\n') sys.exit(1) signal.signal(signal.SIGINT, catch_keyboard) @@ -314,13 +324,66 @@ def delete_profile_option(args): Args: args (Namespace): Argparse command line arguments defined in main. 
-
    """
    for opt in args.options:
        print('Removing {} from {}'.format(opt, args.section))
        fyrd.conf.profiles.remove_option(args.section, opt)
    print('Done')
 
+
+###################
+#  Running stuff  #
+###################
+
+
+def run(args):
+    """Run an arbitrary shell script as a job.
+
+    Args:
+        args (Namespace): Argparse command line arguments defined in main.
+    """
+    kwargs = {}
+    if args.args:
+        for arg in args.args.split(','):
+            sarg = arg.split('=')
+            if len(sarg) == 1:
+                kwargs[sarg[0]] = None
+            elif len(sarg) == 2:
+                kwargs[sarg[0]] = sarg[1]
+            else:
+                raise TypeError(
+                    'Invalid argument: {0}, must be key=value, '
+                    'or just key'.format(arg)
+                )
+
+    command = ' '.join(args.shell_script)
+
+    job = fyrd.job.Job(command, profile=args.profile, **kwargs).submit()
+
+    print('Job submitted as job {0}'.format(job.id))
+
+    if args.wait:
+        print('Waiting for job to complete')
+        fyrd.wait(job)
+
+
+def sub_files(args):
+    """Submit any number of existing job files and optionally wait for them.
+
+    Args:
+        args (Namespace): Argparse command line arguments defined in main.
+    """
+    for job in args.shell_scripts:
+        if not os.path.isfile(job):
+            sys.stderr.write('Job file {0} does not exist\n'.format(job))
+            sys.exit(1)
+    job_nos = []
+    for job in args.shell_scripts:
+        job_no = fyrd.basic.submit_file(job)
+        print('{0}: {1}'.format(job_no, job))
+        job_nos.append(job_no)
+    if args.wait:
+        print('Waiting for jobs to complete')
+        fyrd.basic.wait(job_nos)
+
 
 #######################
 #  Other Subcommands  #
 #######################
@@ -552,7 +615,113 @@ def command_line_parser():
 
     # Subcommands
     modes = parser.add_subparsers(
-        dest='modes', metavar='{conf,prof,keywords,queue,wait,clean}')
+        dest='modes',
+        metavar='{run,submit,wait,queue,conf,prof,keywords,clean}'
+    )
+
+    ######################
+    #  Run Shell Script  #
+    ######################
+
+    run_sub = modes.add_parser(
+        'run', description=RUN_HELP, help="Run simple shell scripts",
+        aliases=['r'],
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    run_sub.add_argument('shell_script', nargs='+',
+                         help="The script to run")
+    run_sub.add_argument('-w', '--wait', action='store_true',
+                         help='Wait for the job to complete')
+    run_sub.add_argument('-p', '--profile',
+                         help='The profile to use to run')
+    run_sub.add_argument('-a', '--args',
+                         help='Submission args, e.g.: ' +
+                         "'time=00:20:00,mem=20G,cores=10'")
+
+    # Set function
+    run_sub.set_defaults(func=run)
+
+    ########################
+    #  Run Existing Files  #
+    ########################
+
+    run_job_sub = modes.add_parser(
+        'submit', description=RUN_JOB_HELP, help="Submit existing job files",
+        aliases=['sub', 's'],
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    run_job_sub.add_argument('shell_scripts', nargs='+',
+                             help="The script(s) to submit")
+    run_job_sub.add_argument('-w', '--wait', action='store_true',
+                             help='Wait for the jobs to complete')
+
+    # Set function
+    run_job_sub.set_defaults(func=sub_files)
+
+    #################
+    #  Job Waiting  #
+    #################
+
+    wait_sub = modes.add_parser(
+        'wait', description=WAIT_HELP, help="Wait for jobs",
+        aliases=['w'],
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    # We use * here instead of + because it is possible to wait on all jobs
+    # for a user, making the jobs argument unnecessary.
+    wait_sub.add_argument('jobs', nargs='*', help="Job list to wait for")
+    wait_sub.add_argument('-u', '--users',
+                          help='A comma-separated list of users to wait for')
+
+    # Set function
+    wait_sub.set_defaults(func=wait)
+
+    ###################
+    #  Queue Parsing  #
+    ###################
+
+    queue_sub = modes.add_parser(
+        'queue', aliases=['q'],
+        description=QUEUE_HELP, help="Search the queue",
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    # User and partition filtering
+    queue_filter = queue_sub.add_argument_group('queue filtering')
+    queue_filter_m = queue_filter.add_mutually_exclusive_group()
+    queue_filter_m.add_argument('-u', '--users', nargs='+', metavar='',
+                                help='Limit to these users')
+    queue_filter_m.add_argument('-a', '--all-users', action='store_true',
+                                help='Display jobs for all users')
+    queue_filter.add_argument('-p', '--partitions', nargs='+', metavar='',
+                              help="Limit to these partitions (queues)")
+
+    # State filtering
+    queue_filter = queue_sub.add_argument_group('queue state filtering')
+    queue_filter_s = queue_filter.add_mutually_exclusive_group()
+    queue_filter_s.add_argument('-r', '--running', action='store_true',
+                                help="Show only running jobs")
+    queue_filter_s.add_argument('-q', '--queued', action='store_true',
+                                help="Show only queued jobs")
+    queue_filter_s.add_argument('-d', '--done', action='store_true',
+                                help="Show only completed jobs")
+    queue_filter_s.add_argument('-b', '--bad', action='store_true',
+                                help="Show only failed jobs")
+
+    # Display mode
+    queue_disp_group = queue_sub.add_argument_group('display options')
+    queue_disp = queue_disp_group.add_mutually_exclusive_group()
+    queue_disp.add_argument('-l', '--list', action='store_true',
+                            help="Print job numbers only, works well with " +
+                            "xargs")
+    queue_disp.add_argument('-c', '--count', action='store_true',
+                            help="Print job count only")
+
+    # Set function
+    queue_sub.set_defaults(func=queue)
 
     #########################
     #  Config Manipulation  #
     #########################
@@ -679,66 +848,6 @@ def command_line_parser():
                           help="Print a list of keywords only")
     keywords.set_defaults(func=keyword_help)
 
-    ###################
-    #  Queue Parsing  #
-    ###################
-
-    queue_sub = modes.add_parser(
-        'queue', aliases=['q'],
-        description=QUEUE_HELP, help="Search the queue",
-        formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-
-    # User and partition filtering
-    queue_filter = queue_sub.add_argument_group('queue filtering')
-    queue_filter_m = queue_filter.add_mutually_exclusive_group()
-    queue_filter_m.add_argument('-u', '--users', nargs='+', metavar='',
-                                help='Limit to these users')
-    queue_filter_m.add_argument('-a', '--all-users', action='store_true',
-                                help='Display jobs for all users')
-    queue_filter.add_argument('-p', '--partitions', nargs='+', metavar='',
-                              help="Limit to these partitions (queues)")
-
-    # State filtering
-    queue_filter = queue_sub.add_argument_group('queue state filtering')
-    queue_filter_s = queue_filter.add_mutually_exclusive_group()
-    queue_filter_s.add_argument('-r', '--running', action='store_true',
-                                help="Show only running jobs")
-    queue_filter_s.add_argument('-q', '--queued', action='store_true',
-                                help="Show only queued jobs")
-    queue_filter_s.add_argument('-d', '--done', action='store_true',
-                                help="Show only completed jobs")
-    queue_filter_s.add_argument('-b', '--bad', action='store_true',
-                                help="Show only completed jobs")
-
-    # Display mode
-    queue_disp_group = queue_sub.add_argument_group('display options')
-    queue_disp = queue_disp_group.add_mutually_exclusive_group()
- queue_disp.add_argument('-l', '--list', action='store_true', - help="Print job numbers only, works well with " + - "xargs") - queue_disp.add_argument('-c', '--count', action='store_true', - help="Print job count only") - - # Set function - queue_sub.set_defaults(func=queue) - - ################# - # Job Waiting # - ################# - - wait_sub = modes.add_parser( - 'wait', description=WAIT_HELP, help="Wait for jobs", - formatter_class=argparse.RawDescriptionHelpFormatter - ) - - wait_sub.add_argument('jobs', nargs='*', help="Job list to wait for") - wait_sub.add_argument('-u', '--users', - help='A comma-separated list of users to wait for') - - # Set function - wait_sub.set_defaults(func=wait) - ######################## # Directory Cleaning # ######################## diff --git a/fyrd/basic.py b/fyrd/basic.py index 6bc4257..c43f65b 100644 --- a/fyrd/basic.py +++ b/fyrd/basic.py @@ -15,8 +15,8 @@ from . import run as _run from . import conf as _conf from . import queue as _queue -from . import local as _local from . import logme as _logme +from . import batch_systems as _batch from . import ClusterError as _ClusterError from .job import Job @@ -53,7 +53,7 @@ def submit(command, args=None, kwargs=None, name=None, qtype=None, Job object """ - _queue.check_queue() # Make sure the queue.MODE is usable + _batch.check_queue() # Make sure the queue.MODE is usable job = Job(command=command, args=args, kwargs=kwargs, name=name, qtype=qtype, profile=profile, **kwds) @@ -74,8 +74,6 @@ def make_job(command, args=None, kwargs=None, name=None, qtype=None, profile=None, **kwds): """Make a job file compatible with the chosen cluster. - If mode is local, this is just a simple shell script. - Args: command (function/str): The command or function to execute. args (tuple/dict): Optional arguments to add to command, @@ -98,7 +96,7 @@ def make_job(command, args=None, kwargs=None, name=None, qtype=None, Job object """ - _queue.check_queue() # Make sure the queue.MODE is usable + _batch.check_queue() # Make sure the queue.MODE is usable job = Job(command=command, args=args, kwargs=kwargs, name=name, qtype=qtype, profile=profile, **kwds) @@ -133,7 +131,7 @@ def make_job_file(command, args=None, kwargs=None, name=None, qtype=None, Job object """ - _queue.check_queue() # Make sure the queue.MODE is usable + _batch.check_queue() # Make sure the queue.MODE is usable job = Job(command=command, args=args, kwargs=kwargs, name=name, qtype=qtype, profile=profile, **kwds) @@ -141,7 +139,7 @@ def make_job_file(command, args=None, kwargs=None, name=None, qtype=None, job = job.write() # Return the path to the script - return job.submission + return job.submission.file_name ############## @@ -149,14 +147,15 @@ def make_job_file(command, args=None, kwargs=None, name=None, qtype=None, ############## -def clean(jobs): - """Delete all files in jobs list or single Job object.""" - if isinstance(jobs, Job): - jobs = [jobs] - if not isinstance(jobs, (list, tuple)): - raise _ClusterError('Job list must be a Job, list, or tuple') +def clean(jobs, clean_outputs=False): + """Delete all files in jobs list or single Job object. + + Attributes: + clean_outputs (bool): Also clean outputs. + """ + jobs = _run.listify(jobs) for job in jobs: - job.clean() + job.clean(delete_outputs=clean_outputs) ############################################################################### @@ -168,7 +167,6 @@ def submit_file(script_file, dependencies=None, threads=None, qtype=None): """Submit a job file to the cluster. 
If qtype or queue.MODE is torque, qsub is used; if it is slurm, sbatch - is used; if it is local, the file is executed with subprocess. This function is independent of the Job object and just submits a file. @@ -177,96 +175,18 @@ def submit_file(script_file, dependencies=None, threads=None, qtype=None): In slurm: `--dependency=afterok:` is used For torque: `-W depend=afterok:` is used - threads: Total number of threads to use at a time, defaults to all. + threads: Total number of threads to use at a time, defaults to all ONLY USED IN LOCAL MODE Returns: - job number for torque or slurm multiprocessing job object for local - mode + job number """ - _queue.check_queue() # Make sure the queue.MODE is usable - - if not qtype: - qtype = _queue.get_cluster_environment() - - # Check dependencies - if dependencies: - if isinstance(dependencies, (str, int)): - dependencies = [dependencies] - if not isinstance(dependencies, (list, tuple)): - raise Exception('dependencies must be a list, int, or string.') - dependencies = [str(i) for i in dependencies] - - if qtype == 'slurm': - if dependencies: - dependencies = '--dependency=afterok:{}'.format( - ':'.join([str(d) for d in dependencies])) - args = ['sbatch', dependencies, script_file] - else: - args = ['sbatch', script_file] - # Try to submit job 5 times - count = 0 - while True: - code, stdout, stderr = _run.cmd(args) - if code == 0: - job = int(stdout.split(' ')[-1]) - break - else: - if count == 5: - _logme.log('sbatch failed with code {}\n'.format(code), - 'stdout: {}\nstderr: {}'.format(stdout, stderr), - 'critical') - raise _CalledProcessError(code, args, stdout, stderr) - _logme.log('sbatch failed with err {}. Resubmitting.'.format( - stderr), 'debug') - count += 1 - _sleep(1) - continue - break - return job - - elif qtype == 'torque': - if dependencies: - dependencies = '-W depend={}'.format( - ','.join(['afterok:' + d for d in dependencies])) - args = ['qsub', dependencies, script_file] - else: - args = ['qsub', script_file] - # Try to submit job 5 times - count = 0 - while True: - code, stdout, stderr = _run.cmd(args) - if code == 0: - job = int(stdout.split('.')[0]) - break - else: - if count == 5: - _logme.log('qsub failed with code {}\n'.format(code), - 'stdout: {}\nstderr: {}'.format(stdout, stderr), - 'critical') - raise _CalledProcessError(code, args, stdout, stderr) - _logme.log('qsub failed with err {}. 
Resubmitting.'.format(
-                    stderr), 'debug')
-                count += 1
-                _sleep(1)
-                continue
-            break
-        return job
-
-    elif qtype == 'local':
-        # Normal mode dependency tracking uses only integer job numbers
-        depends = []
-        if dependencies:
-            for depend in dependencies:
-                if isinstance(depend, Job):
-                    depends.append(int(depend.id))
-                else:
-                    depends.append(int(depend))
-        command = 'bash {}'.format(script_file)
-        # Make sure the global job pool exists
-        if not _local.JQUEUE or not _local.JQUEUE.runner.is_alive():
-            _local.JQUEUE = _local.JobQueue(cores=threads)
-        return _local.JQUEUE.add(_run.cmd, (command,), dependencies=depends)
+    qtype = qtype if qtype else _batch.get_cluster_environment()
+    _batch.check_queue(qtype)
+    dependencies = _run.listify(dependencies)
+
+    batch = _batch.get_batch_system(qtype)
+    return batch.submit(script_file, dependencies=dependencies)
 
 
 def clean_work_dirs(outputs=False, confirm=False):
@@ -316,7 +236,7 @@ def clean_dir(directory=None, suffix=None, qtype=None, confirm=False,
     Returns:
         A set of deleted files
     """
-    _queue.check_queue(qtype)  # Make sure the queue.MODE is usable
+    _batch.check_queue(qtype)  # Make sure the queue.MODE is usable
 
     if delete_outputs is None:
         delete_outputs = _conf.get_option('jobs', 'clean_outputs')
@@ -388,3 +308,20 @@ def clean_dir(directory=None, suffix=None, qtype=None, confirm=False,
         _sys.stdout.write('Done\n')
 
     return deleted
+
+
+###############################################################################
+#               Simple Wrapper to Wait on Queue and Get Outputs               #
+###############################################################################
+
+
+def wait(jobs):
+    """Simple wrapper for Queue.wait()."""
+    q = _queue.Queue()
+    return q.wait(jobs)
+
+
+def get(jobs):
+    """Simple wrapper for Queue.get()."""
+    q = _queue.Queue()
+    return q.get(jobs)
diff --git a/fyrd/batch_systems/README.rst b/fyrd/batch_systems/README.rst
new file mode 100644
index 0000000..bda4ad1
--- /dev/null
+++ b/fyrd/batch_systems/README.rst
@@ -0,0 +1,257 @@
+Adding Batch Systems
+====================
+
+Fyrd is intended to be fully modular, meaning anyone should be able to
+implement support for any batch system, even other remote submission systems
+like DistributedPython, *if* they are able to define the following functions
+and options.
+
+To add a new batch system, you will need to:
+
+1. Edit `__init__.py` to:
+   1. Update `DEFINED_SYSTEMS` to include your batch system
+   2. Edit `get_cluster_environment()` to detect your batch system; this function
+      is ordered, meaning that it checks for slurm before torque, as slurm
+      implements torque aliases. You should add a sensible way of detecting your
+      batch system here.
+2. Create a file in this directory with the name of your batch system (must match
+   the name in `DEFINED_SYSTEMS`). This file must contain all constants and functions
+   described below in the `Batch Script <#Batch_Script>`_ section.
+3. Edit `options.py` as described below in the `Options <#Options>`_ section.
+4. Run the pyenv test suite on your cluster system and make sure all tests pass
+   on all versions of python supported by fyrd on your cluster system.
+5. Optionally add a buildkite script on your cluster to allow CI testing. Note:
+   this will technically give anyone with push privileges (i.e. me) the ability
+   to execute code on your server. I promise to do no evil, but I can understand
+   a degree of uncertainty regarding that. However, using buildkite will allow us
+   to make sure that future updates don't break support for your batch system.
+6. Become a fyrd maintainer! I always need help; if you want to contribute more,
+   please do :-)
+
+Options
+-------
+
+Fyrd works primarily by converting batch system arguments (e.g. `--queue`
+for torque and `--partition` for slurm) into python keyword arguments. This is
+done by creating dictionaries in the `fyrd/batch_systems/options.py` file.
+
+Option parsing is done on job creation by calling the
+`options.options_to_string()` function on the user provided keyword arguments.
+The primary point of this function is to convert all keyword arguments to
+string forms that can go at the top of your batch file prior to cluster
+submission. Therefore you *must* edit the dictionaries in `options.py` to
+include your batch system definitions. The most important section to edit is
+`CLUSTER_CORE`; this dictionary has sections for each batch system, e.g. for
+walltime::
+
+    ('time',
+     {'help': 'Walltime in HH:MM:SS',
+      'default': '12:00:00', 'type': str,
+      'slurm': '--time={}', 'torque': '-l walltime={}'}),
+
+This auto-converts the time argument provided by the user into `--time` for
+slurm and `-l walltime=` for torque.
+
+As all systems are a little different, `options.options_to_string()` first
+calls the `parse_strange_options()` function in the batch system definition
+script to allow you the option to manually parse all options that cannot be
+handled so simply. Ideally this function will do nothing but return the input,
+but in some cases it makes sense for this function to handle every argument; an
+obvious example is when running using something like `multiprocessing` instead
+of a true batch system.
+
+Batch Script
+------------
+
+The defined batch script must have the name of your system and must define the
+following constants and functions in exactly the way described below. Your
+functions can do anything you want, and you can have extra functions in your
+file (maybe make them private with a leading `_` in the name), but the primary
+functions must take exactly the same arguments as those described below, and
+provide exactly the same return values.
+
+Constants
+.........
+
+- `PREFIX`: The string that will go before options at the top of a script file;
+  it could be blank for simple shell scripts, for slurm it is `'#SBATCH'`
+
+Functions
+.........
+
+normalize_job_id(job_id)
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Input:
+
+- job_id: string, return value from job submission
+
+Output:
+
+- job_id: string, a normalized job id
+- array_id: string or None, a normalized array job id
+
+Description:
+
+Take a string returned by your job submission script (e.g. `qsub`) and turn it
+into a normalized (hopefully a string version of an int) job ID or process ID
+and an array_id, if that is implemented by your system. The array_id can be
+None if not implemented and should be None if not present (i.e. the job is not
+an array job).
+
+normalize_state(state)
+~~~~~~~~~~~~~~~~~~~~~~
+
+Input:
+
+- state: string, a state description from the queue, e.g. 'running', or 'R'
+
+Output:
+
+- state: string, a state normalized into one of:
+
+  - 'completed'
+  - 'completing'
+  - 'held'
+  - 'pending'
+  - 'running'
+  - 'suspended'
+
+gen_scripts(job_object, command, args, precmd, modstr)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Input:
+
+- job_object: Job, a `fyrd.job.Job` object for the current job
+- command: string, a string of the command to be run
+- args: any additional arguments that are to be submitted, generally not used
+- precmd: string, the batch system directives created by `options_to_string`,
+  you can edit this or overwrite it if necessary
+- modstr: string, a string of module imports (e.g. module load samtools) set
+  by the user
+
+Output:
+
+- submission_script: `fyrd.submission_scripts.Script` object with the script
+  to run
+- exec_script: `fyrd.submission_scripts.Script` object with an additional
+  script called by the submission script if necessary, can be None
+
+Description:
+
+This is one of the more complex functions, but essentially you are going to
+just format the `fyrd.script_runners.CMND_RUNNER_TRACK` script using the
+objects in the inputs. This just makes an executable submission script, so you
+can build it any way you want; you don't have to use the `CMND_RUNNER_TRACK`
+script. However, if you make your own script, the STDOUT must include
+timestamps like this::
+
+    date +'%y-%m-%d-%H:%M:%S'
+    echo "Running {name}"
+    {command}
+    exitcode=$?
+    echo Done
+    date +'%y-%m-%d-%H:%M:%S'
+    if [[ $exitcode != 0 ]]; then
+        echo Exited with code: $exitcode >&2
+    fi
+    exit $exitcode
+
+This is because we parse the first two and last 2/3 lines of the file to get
+the job runtimes and exit codes.
+
+Here is an example function:
+
+.. code:: python
+
+   def gen_scripts(job_object, command, args, precmd, modstr):
+       """Create script object for job, does not create a sep. exec script."""
+       scrpt = _os.path.join(job_object.scriptpath,
+                             '{}.cluster.qsub'.format(job_object.name))
+
+       sub_script = _scrpts.CMND_RUNNER_TRACK.format(
+           precmd=precmd, usedir=job_object.runpath, name=job_object.name,
+           command=command
+       )
+       return _Script(script=sub_script, file_name=scrpt), None
+
+submit(file_name, dependencies=None)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Input:
+
+- file_name: string, The path to the file to execute
+- dependencies: list, A list of dependencies (job objects or job numbers)
+
+Output:
+
+- job_id: string, A job number
+
+Description:
+
+This function must actually submit the job file, however you want it to. If
+possible, include dependency tracking; if that isn't possible, raise a
+NotImplementedError. You can make use of `fyrd.run.cmd`, which allows you to
+execute code directly on the terminal and can catch errors and retry
+submission however many times you choose (5 is a good number). It also returns
+the exit_code, STDOUT, and STDERR for the execution.
+
+Please add as much error catching code as possible here; the `torque.py`
+example is a good one. 
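+
+As a rough sketch only (the `mybatch` command and its `--after` flag below are
+made-up placeholders for whatever your system provides, not a real fyrd API),
+a minimal `submit()` built on `fyrd.run.cmd` could look like this:
+
+.. code:: python
+
+   from fyrd import run as _run
+
+   def submit(file_name, dependencies=None):
+       """Sketch: submit file_name with a hypothetical 'mybatch' command."""
+       args = ['mybatch']
+       if dependencies:
+           # Hypothetical dependency flag; raise NotImplementedError here
+           # instead if your system has no dependency support
+           args.append('--after=' + ':'.join(str(d) for d in dependencies))
+       args.append(file_name)
+       # fyrd.run.cmd retries failed submissions and returns
+       # (exit_code, stdout, stderr)
+       code, stdout, stderr = _run.cmd(args, tries=5)
+       if code != 0:
+           raise Exception('Submission failed: {0}'.format(stderr))
+       job_id, _ = normalize_job_id(stdout.strip())
+       return job_id
+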
+ +queue_parser(user=None, partition=None) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Input: + +- user: string, optional username to limit to +- partition: string, optional partition/queue to limit to + +(Fine to ignore these arguments if they are not implemented on your system) + +Yields (must be an iterator): + +- job_id: string +- array_id: string, optional array job number +- name: string, a name for the job +- userid: string, user of the job (can be None) +- partition: string, partition running in (can be None) +- state: string a slurm-style string representation of the state +- nodelist: list, the nodes the job is running on +- numnodes: int, a count of the number of nodes +- threads_per_node: int, a count of the number of cores being used on each node +- exit_code: int, an exit_code (can be None if not exited yet) **Must** be an int + if state == 'completed'. **must** be 0 if job completed successfully. + +Description: + +This is the iterator that is the core of the batch system definition. You must +somehow be able to parse all of the currently running jobs and return the above +information about every job. *If your batch system implements array jobs, this +generator must yield one entry per array child, not parent job*. + +parse_strange_options(option_dict) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Inputs: + +- option_dict: dictionary, a dictionary of keywords from the `options.py` file + prior to interpretation with `option_to_string`, allowing parsing of all + unusual keywords. + +Outputs: + +- outlist: list, A list of **strings** that will be added to the top of the submit + file +- option_dict: dictionary, A parsed version of option_dict with **all options not + defined in the appropriate dictionaries in `options.py` removed**. + +Summary +------- + +The modularity of this system is intended to make it easy to support any batch +system, however it is possible that some systems won't fit into the mold defined +here. If that is the case, feel free to alter other parts of the code to make it +work, but **be sure that all tests run successfully on every defined cluster on +every supported version of python**. Feel free to reach out to me to request +testing if you do not have access to any system. + diff --git a/fyrd/batch_systems/__init__.py b/fyrd/batch_systems/__init__.py new file mode 100644 index 0000000..b0f779d --- /dev/null +++ b/fyrd/batch_systems/__init__.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +""" +Modular batch system handling. + +All batch system specific functions are contained within files in the +batch_systems folder. The files must have the same name as the batch system, +and possible batch systems are set in the DEFINED_SYSTEMS set. Most batch +system functions are set in the modules in this package, but system detection +methods are hardcoded into get_cluster_environment() also. + +To add new systems, create a new batch system with identical function and +classes names and return/yield values to those in an existing definition. You +will also need to update the options.py script to include keywords for your +system and the get_cluster_environment() function to include autodetection. +""" +from importlib import import_module as _import + +from .. import run as _run +from .. import logme as _logme +from .. import ClusterError as _ClusterError + +DEFINED_SYSTEMS = {'torque', 'slurm', 'local'} + +# This is set in the get_cluster_environment() function. 
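+# It will hold one of the DEFINED_SYSTEMS names ('torque', 'slurm', or
+# 'local') once detection has run; until then it is an empty string.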
+MODE = '' + + +def get_batch_system(qtype=None): + """Return a batch_system module.""" + qtype = qtype if qtype else get_cluster_environment() + if qtype not in DEFINED_SYSTEMS: + raise _ClusterError( + 'qtype value {} is not recognized, '.format(qtype) + + 'should be: local, torque, or slurm' + ) + return _import('fyrd.batch_systems.{}'.format(qtype)) + + +################################# +# Set the global cluster type # +################################# + + +def get_cluster_environment(): + """Detect the local cluster environment and set MODE globally. + + Detect the current batch system by looking for command line utilities. + Order is important here, so we hard code the batch system lookups. + + Paths to files can also be set in the config file. + + Returns: + tuple: MODE variable ('torque', 'slurm', or 'local') + """ + global MODE + from .. import conf as _conf + conf_queue = _conf.get_option('queue', 'queue_type', 'auto') + if conf_queue not in list(DEFINED_SYSTEMS) + ['auto']: + _logme.log('queue_type in the config file is {}, '.format(conf_queue) + + 'but it should be one of {}'.format(DEFINED_SYSTEMS) + + ' or auto. Resetting it to auto', 'warn') + _conf.set_option('queue', 'queue_type', 'auto') + conf_queue = 'auto' + if conf_queue == 'auto': + # Hardcode queue lookups here + sbatch_cmnd = _conf.get_option('queue', 'sbatch') + qsub_cmnd = _conf.get_option('queue', 'qsub') + sbatch_cmnd = sbatch_cmnd if sbatch_cmnd else 'sbatch' + qsub_cmnd = qsub_cmnd if qsub_cmnd else 'qsub' + if _run.which(sbatch_cmnd): + MODE = 'slurm' + elif _run.which(qsub_cmnd): + MODE = 'torque' + else: + MODE = 'local' + else: + MODE = conf_queue + if MODE == 'local': + _logme.log('No cluster environment detected, using multiprocessing', + 'debug') + else: + _logme.log('{} detected, using for cluster submissions'.format(MODE), + 'debug') + return MODE + + +############################## +# Check if queue is usable # +############################## + + +def check_queue(qtype=None): + """Raise exception if MODE is incorrect.""" + if 'MODE' not in globals(): + global MODE + MODE = get_cluster_environment() + if not MODE: + MODE = get_cluster_environment() + if qtype: + if qtype not in DEFINED_SYSTEMS: + raise _ClusterError('qtype value {} is not recognized, ' + .format(qtype) + + 'should be one of {}'.format(DEFINED_SYSTEMS)) + else: + if MODE not in DEFINED_SYSTEMS: + MODE = qtype + return True + elif MODE not in DEFINED_SYSTEMS: + raise _ClusterError('MODE value {} is not recognized, '.format(MODE) + + 'should be: local, torque, or slurm') + + +# Make options easily available everywhere +from . import options diff --git a/fyrd/options.py b/fyrd/batch_systems/options.py similarity index 90% rename from fyrd/options.py rename to fyrd/batch_systems/options.py index 1b40e6a..8f31b0d 100644 --- a/fyrd/options.py +++ b/fyrd/batch_systems/options.py @@ -14,7 +14,7 @@ All of these fields are required except in the case that: 1. The option is managed in options_to_string explicitly - 2. The option is in NORMAL, TORQUE, or SLURM dictionaries, in which case + 2. The option is in TORQUE or SLURM dictionaries, in which case flags used by other queue systems can be skipped. """ import os as _os @@ -27,9 +27,12 @@ from six import reraise as _raise from tabulate import tabulate as _tabulate -from . import run -from . import logme -from . import ClusterError +from .. import logme +from .. import ClusterError + +from . import MODE +from . import get_batch_system +from . 
import check_queue __all__ = ['option_help'] @@ -92,14 +95,7 @@ 'slurm': '-e {}', 'torque': '-e {}'}), ]) -# Options used in only local runs -NORMAL = _OD([ - ('threads', - {'help': 'Number of threads to use on the local machine', - 'default': 4, 'type': int}), -]) - -# Options used in both torque and slurm +# Options used in all batch systems CLUSTER_CORE = _OD([ ('nodes', {'help': 'Number of nodes to request', @@ -177,6 +173,7 @@ ('queue', 'partition'), ('memory', 'mem'), ('cpus', 'cores'), + ('threads', 'cores'), ('walltime', 'time'), ('delete_files', 'clean_files'), ('delete_outputs', 'clean_outputs'), @@ -217,12 +214,13 @@ CLUSTER_KWDS = SLURM_KWDS.copy() CLUSTER_KWDS.update(TORQUE_KWDS) -NORMAL_KWDS = COMMON.copy() -for kds in [NORMAL]: - NORMAL_KWDS.update(kds) +# Should include the above in a dictionary by qtype +BATCH_KWDS = { + 'slurm': SLURM_KWDS, + 'torque': TORQUE_KWDS, +} ALL_KWDS = CLUSTER_KWDS.copy() -ALL_KWDS.update(NORMAL_KWDS) # Will be 'name' -> type ALLOWED_KWDS = _OD() @@ -370,7 +368,7 @@ def check_arguments(kwargs): err = list(_sys.exc_info()) err[1] = ValueError(memerror) _raise(*err) - if len(list(groups)) != 0 or not svalk or sunitk: + if list(groups) or not svalk or sunitk: raise ValueError(memerror) if sunit == 'b': opt = int(float(sval)/float(1024)/float(1024)) @@ -400,15 +398,13 @@ def option_to_string(option, value=None, qtype=None): Args: option: An allowed option definied in options.all_options value: A value for that option if required (if None, default used) - qtype: 'torque', 'slurm', or 'local': override queue.MODE + qtype: One of the defined batch systems Returns: str: A string with the appropriate flags for the active queue. """ # Import a couple of queue functions here - from . import queue - qtype = qtype if qtype else queue.MODE - queue.check_queue(qtype) + qtype = qtype if qtype else MODE if isinstance(option, dict): raise ValueError('Arguments to option_to_string cannot be ' @@ -420,15 +416,7 @@ def option_to_string(option, value=None, qtype=None): raise OptionsError('Cannot handle cores or nodes here, use ' + 'options_to_string') - if qtype == 'slurm': - kwds = SLURM_KWDS - elif qtype == 'torque': - kwds = TORQUE_KWDS - elif qtype == 'local': - return '' # There is no need of this in local mode - else: - # This should never happen - raise ClusterError('Invalid qtype {}'.format(qtype)) + kwds = BATCH_KWDS[qtype] # Make sure argument allowed option, value = list(check_arguments({option: value}).items())[0] @@ -450,7 +438,7 @@ def option_to_string(option, value=None, qtype=None): raise OptionsError('{} requires a value'.format(option)) # Return formatted string - prefix = '#SBATCH' if qtype == 'slurm' else '#PBS' + prefix = get_batch_system(qtype).PREFIX if '{}' in kwds[option][qtype]: if value is None: return '' @@ -462,18 +450,23 @@ def option_to_string(option, value=None, qtype=None): def options_to_string(option_dict, qtype=None): - """Return a multi-line string for slurm or torque job submission. + """Return a multi-line string for job submission. + + This function pre-parses options and then passes them to the + parse_strange_options function of each batch system, before using the + option_to_string function to parse the remaining options. Args: option_dict (dict): Dict in format {option: value} where value can be None. If value is None, default used. - qtype (str): 'torque', 'slurm', or 'local': override queue.MODE + qtype (str): The defined batch system Returns: str: A multi-line string of torque or slurm options. 
""" - # Import a couple of queue functions here - from . import queue + qtype = qtype if qtype else MODE + batch = get_batch_system(qtype) + check_queue(qtype) # Sanitize arguments if not isinstance(option_dict, dict): @@ -482,16 +475,8 @@ def options_to_string(option_dict, qtype=None): option_dict = check_arguments(option_dict.copy()) - qtype = qtype if qtype else queue.MODE - - queue.check_queue(qtype) - outlist = [] - # Handle cores separately - nodes = int(option_dict.pop('nodes')) if 'nodes' in option_dict else 1 - cores = int(option_dict.pop('cores')) if 'cores' in option_dict else 1 - # Set path if required if 'filepath' in option_dict: filepath = _os.path.abspath(option_dict.pop('filepath')) @@ -502,19 +487,9 @@ def options_to_string(option_dict, qtype=None): option_dict['errfile'] = _os.path.join( filepath, _os.path.basename(option_dict['errfile'])) - if qtype == 'slurm': - outlist.append('#SBATCH --ntasks {}'.format(nodes)) - outlist.append('#SBATCH --cpus-per-task {}'.format(cores)) - elif qtype == 'torque': - outstring = '#PBS -l nodes={}:ppn={}'.format(nodes, cores) - if 'features' in option_dict: - outstring += ':' + ':'.join( - run.opt_split(option_dict.pop('features'), (',', ':'))) - if 'qos' in option_dict: - outstring += ',qos={}'.format(option_dict.pop('qos')) - outlist.append(outstring) - - # Loop through all options + outlist, option_dict = batch.parse_strange_options(option_dict) + + # Loop through all remaining options for option, value in option_dict.items(): outlist.append(option_to_string(option, value, qtype)) @@ -556,11 +531,6 @@ def option_help(mode='string', qtype=None, tablefmt='simple'): 'help': _OD([('imports', impts)]), } - hlp['local'] = { - 'summary': 'Used only in local mode', - 'help': NORMAL, - } - # Include all cluster options in one cluster = CLUSTER_CORE.copy() cluster.update(CLUSTER_OPTS) @@ -582,16 +552,12 @@ def option_help(mode='string', qtype=None, tablefmt='simple'): } if qtype: - if qtype == 'local': - hlp.pop('cluster') - hlp.pop('torque') - hlp.pop('slurm') - elif qtype == 'slurm': + if qtype == 'slurm': hlp.pop('torque') elif qtype == 'torque': hlp.pop('slurm') else: - raise ClusterError('qtype must be "torque", "slurm", or "local"') + raise ClusterError('qtype must be "torque", "slurm"') if mode == 'print' or mode == 'string': outstr = '' diff --git a/fyrd/batch_systems/slurm.py b/fyrd/batch_systems/slurm.py new file mode 100644 index 0000000..59c0aab --- /dev/null +++ b/fyrd/batch_systems/slurm.py @@ -0,0 +1,272 @@ +# -*- coding: utf-8 -*- +""" +SLURM parsing functions. +""" +import os as _os +import re as _re +import sys as _sys +import pwd as _pwd # Used to get usernames for queue +from subprocess import CalledProcessError as _CalledProcessError + +from .. import run as _run +from .. import logme as _logme +from .. import ClusterError as _ClusterError +from .. import script_runners as _scrpts +from .. 
import submission_scripts as _sscrpt +_Script = _sscrpt.Script + + +PREFIX = '#SBATCH' + + +############################################################################### +# Normalization Functions # +############################################################################### + + +def normalize_job_id(job_id): + """Convert the job id into job_id, array_id.""" + if '_' in job_id: + job_id, array_id = job_id.split('_') + job_id = job_id.strip() + array_id = array_id.strip() + else: + array_id = None + return job_id, array_id + + +def normalize_state(state): + """Convert state into standardized (slurm style) state.""" + return state + + +############################################################################### +# Job Submission # +############################################################################### + + +def gen_scripts(job_object, command, args, precmd, modstr): + """Build the submission script objects. + + Creates an exec script as well as a submission script. + """ + scrpt = _os.path.join( + job_object.scriptpath, '{}.{}.sbatch'.format( + job_object.name, job_object.suffix + ) + ) + + # We use a separate script and a single srun command to avoid + # issues with multiple threads running at once + exec_script = _os.path.join( + job_object.scriptpath, '{}.{}.script'.format( + job_object.name, job_object.suffix + ) + ) + exe_script = _scrpts.CMND_RUNNER_TRACK.format( + precmd=modstr, usedir=job_object.runpath, name=job_object.name, + command=command + ) + + # Create the exec_script Script object + exec_script_obj = _Script( + script=exe_script, file_name=exec_script + ) + + ecmnd = 'srun bash {}'.format(exec_script) + sub_script = _scrpts.SCRP_RUNNER.format( + precmd=precmd, script=exec_script, command=ecmnd + ) + + submission_script = _Script(script=sub_script, file_name=scrpt) + + return submission_script, exec_script_obj + + +def submit(file_name, dependencies=None): + """Submit any file with dependencies to SLURM. + + Args: + file_name (str): Path to an existing file + dependencies (list): List of dependencies + + Returns: + job_id (str) + """ + _logme.log('Submitting to slurm', 'debug') + if dependencies: + deps = '--dependency=afterok:{}'.format( + ':'.join([str(d) for d in dependencies])) + args = ['sbatch', deps, file_name] + else: + args = ['sbatch', file_name] + + # Try to submit job 5 times + code, stdout, stderr = _run.cmd(args, tries=5) + if code == 0: + job_id, _ = normalize_job_id(stdout.split(' ')[-1]) + else: + _logme.log('sbatch failed with code {}\n'.format(code) + + 'stdout: {}\nstderr: {}'.format(stdout, stderr), + 'critical') + raise _CalledProcessError(code, args, stdout, stderr) + + return job_id + + +############################################################################### +# Queue Parsing # +############################################################################### + + +def queue_parser(user=None, partition=None): + """Iterator for slurm queues. + + Use the `squeue -O` command to get standard data across implementations, + supplement this data with the results of `sacct`. sacct returns data only + for the current user but retains a much longer job history. Only jobs not + returned by squeue are added with sacct, and they are added to *the end* of + the returned queue, i.e. *out of order with respect to the actual queue*. 
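To make the normalization contract above concrete, here is a minimal sketch of how the slurm helpers behave (the job IDs are made-up examples; `normalize_job_id` is the function defined in this file)::

    from fyrd.batch_systems import slurm

    # SLURM reports array tasks as '<job_id>_<task_id>'; plain jobs have
    # no array component, so the second element is None
    slurm.normalize_job_id('12345_7')   # -> ('12345', '7')
    slurm.normalize_job_id('12345')     # -> ('12345', None)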
+ + Args: + user: optional user name to filter queue with + partition: optional partition to filter queue with + + Yields: + tuple: job_id, name, userid, partition, state, nodelist, numnodes, + ntpernode, exit_code + """ + nodequery = _re.compile(r'([^\[,]+)(\[[^\[]+\])?') + qargs = ['squeue', '-h', '-O', + 'jobid:400,arraytaskid:400,name:400,userid:400,partition:400,' + + 'state:400,nodelist:400,numnodes:400,numcpus:400,exit_code:400'] + # Parse queue info by length + squeue = [ + tuple( + [k[i:i+200].rstrip() for i in range(0, 4000, 400)] + ) for k in _run.cmd(qargs)[1].split('\n') + ] + # SLURM sometimes clears the queue extremely fast, so we use sacct + # to get old jobs by the current user + qargs = ['sacct', '-p', + '--format=jobid,jobname,user,partition,state,' + + 'nodelist,reqnodes,ncpus,exitcode'] + try: + sacct = [tuple(i.strip(' |').split('|')) for i in + _run.cmd(qargs)[1].split('\n')] + sacct = sacct[1:] + # This command isn't super stable and we don't care that much, so I will + # just let it die no matter what + except Exception as e: + if _logme.MIN_LEVEL == 'debug': + raise e + else: + sacct = [] + + if sacct: + if len(sacct[0]) != 9: + _logme.log('sacct parsing failed unexpectedly as there are not ' + + '9 columns, aborting.', 'critical') + raise ValueError('sacct output does not have 9 columns. Has:' + + '{}: {}'.format(len(sacct[0]), sacct[0])) + jobids = [i[0] for i in squeue] + for sinfo in sacct: + # Skip job steps, only index whole jobs + if '.' in sinfo[0]: + _logme.log('Skipping {} '.format(sinfo[0]) + + "in sacct processing as it is a job part.", + 'verbose') + continue + # These are the values I expect + try: + [sid, sname, suser, spartition, sstate, + snodelist, snodes, scpus, scode] = sinfo + sid, sarr = normalize_job_id(sid) + except ValueError as err: + _logme.log('sacct parsing failed with error {} '.format(err) + + 'due to an incorrect number of entries.\n' + + 'Contents of sinfo:\n{}\n'.format(sinfo) + + 'Expected 9 values\n:' + + '[sid, sname, suser, spartition, sstate, ' + + 'snodelist, snodes, scpus, scode]', + 'critical') + raise + # Skip jobs that were already in squeue + if sid in jobids: + _logme.log('{} still in squeue output'.format(sid), + 'verbose') + continue + scode = int(scode.split(':')[-1]) + squeue.append((sid, sarr, sname, suser, spartition, sstate, + snodelist, snodes, scpus, scode)) + else: + _logme.log('No job info in sacct', 'debug') + + # Sanitize data + for sinfo in squeue: + if len(sinfo) == 10: + [sid, sarr, sname, suser, spartition, sstate, sndlst, + snodes, scpus, scode] = sinfo + else: + _sys.stderr.write('{}'.format(repr(sinfo))) + raise _ClusterError('Queue parsing error, expected 10 items ' 'in output of squeue and sacct, got {}\n' .format(len(sinfo))) + if partition and spartition != partition: + continue + if not isinstance(sid, str): + sid = str(sid) if sid else None + else: + sarr = None + if not isinstance(snodes, int): + snodes = int(snodes) if snodes else None + if not isinstance(scpus, int): + scpus = int(scpus) if scpus else None + if not isinstance(scode, int): + scode = int(scode) if scode else None + # Convert user from ID to name + if suser.isdigit(): + suser = _pwd.getpwuid(int(suser)).pw_name + if user and suser != user: + continue + # Attempt to parse nodelist + snodelist = [] + if sndlst: + if nodequery.search(sndlst): + nsplit = nodequery.findall(sndlst) + for nrg in nsplit: + node, rge = nrg + if not rge: + snodelist.append(node) + else: + for reg in rge.strip('[]').split(','): + # Node range + if
'-' in reg: + start, end = [int(i) for i in reg.split('-')] + for i in range(start, end): + snodelist.append('{}{}'.format(node, i)) + else: + snodelist.append('{}{}'.format(node, reg)) + else: + snodelist = sndlst.split(',') + + yield (sid, sname, suser, spartition, sstate, snodelist, + snodes, scpus, scode) + + +def parse_strange_options(option_dict): + """Parse all options that cannot be handled by the regular function. + + Returns: + list: A list of strings + dict: Altered version of option_dict + """ + outlist = [] + # Handle cores separately + nodes = int(option_dict.pop('nodes')) if 'nodes' in option_dict else 1 + cores = int(option_dict.pop('cores')) if 'cores' in option_dict else 1 + + outlist.append('#SBATCH --ntasks {}'.format(nodes)) + outlist.append('#SBATCH --cpus-per-task {}'.format(cores)) + + return outlist, option_dict diff --git a/fyrd/batch_systems/torque.py b/fyrd/batch_systems/torque.py new file mode 100644 index 0000000..0d81c95 --- /dev/null +++ b/fyrd/batch_systems/torque.py @@ -0,0 +1,247 @@ +# -*- coding: utf-8 -*- +""" +Define functions for using the Torque batch system +""" +import os as _os +import re as _re +from time import sleep as _sleep +import xml.etree.ElementTree as _ET +from subprocess import check_output as _check_output +from subprocess import CalledProcessError as _CalledProcessError + +from .. import run as _run +from .. import logme as _logme +from .. import ClusterError as _ClusterError +from .. import script_runners as _scrpts +from .. import submission_scripts as _sscrpt +_Script = _sscrpt.Script + + +PREFIX = '#PBS' + +# Define torque-to-slurm mappings +TORQUE_SLURM_STATES = { + 'C': 'completed', + 'E': 'completing', + 'H': 'held', # Not a SLURM state + 'Q': 'pending', + 'R': 'running', + 'T': 'suspended', + 'W': 'running', + 'S': 'suspended', +} + + +############################################################################### +# Normalization Functions # +############################################################################### + + +def normalize_job_id(job_id): + """Convert the job id into job_id, array_id.""" + job_id = job_id.split('.')[0] + if '[' in job_id: + job_id, array_id = job_id.split('[') + job_id = job_id.strip('[]') + array_id = array_id.strip('[]') + if not array_id: + array_id = None + else: + array_id = None + return job_id, array_id + + +def normalize_state(state): + """Convert state into standardized (slurm style) state.""" + if state.upper() in TORQUE_SLURM_STATES: + state = TORQUE_SLURM_STATES[state.upper()] + return state + + +############################################################################### +# Job Submission Functions # +############################################################################### + + +def gen_scripts(job_object, command, args, precmd, modstr): + """Create script object for job, does not create a sep. exec script.""" + scrpt = _os.path.join(job_object.scriptpath, + '{}.cluster.qsub'.format(job_object.name)) + + sub_script = _scrpts.CMND_RUNNER_TRACK.format( + precmd=precmd, usedir=job_object.runpath, name=job_object.name, + command=command + ) + return _Script(script=sub_script, file_name=scrpt), None + + +def submit(file_name, dependencies=None): + """Submit any file with dependencies to Torque. 
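The torque module implements the same normalization interface as the slurm module; a short sketch of its behaviour (the job IDs and server name are hypothetical)::

    from fyrd.batch_systems import torque

    # Torque IDs carry a server suffix, and array tasks use brackets;
    # states are mapped onto the slurm-style names defined above
    torque.normalize_job_id('4422.example.org')     # -> ('4422', None)
    torque.normalize_job_id('4422[3].example.org')  # -> ('4422', '3')
    torque.normalize_state('Q')                     # -> 'pending'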
+ + Args: + file_name (str): Path to an existing file + dependencies (list): List of dependencies + + Returns: + job_id (str) + """ + _logme.log('Submitting to torque', 'debug') + if dependencies: + deps = '-W depend={}'.format( + ','.join(['afterok:' + str(d) for d in dependencies])) + args = ['qsub', deps, file_name] + else: + args = ['qsub', file_name] + + # Try to submit job 5 times + code, stdout, stderr = _run.cmd(args, tries=5) + if code == 0: + job_id, _ = normalize_job_id(stdout.split('.')[0]) + elif code == 17 and 'Unable to open script file' in stderr: + _logme.log('qsub submission failed due to an already existing ' 'script file, attempting to rename file and try ' 'again.\nstderr: {}, stdout: {}, cmnd: {}' .format(stderr, stdout, args), 'error') + new_name = args[1] + '.resub' + _os.rename(args[1], new_name) + _logme.log('renamed script {} to {}, resubmitting' .format(args[1], new_name), 'info') + args[1] = new_name + code, stdout, stderr = _run.cmd(args, tries=5) + if code == 0: + job_id, _ = normalize_job_id(stdout.split('.')[0]) + else: + _logme.log('Resubmission still failed, aborting', + 'critical') + raise _CalledProcessError(code, args, stdout, stderr) + else: + if stderr.startswith('qsub: submit error ('): + raise _ClusterError('qsub submission failed with error: ' + + '{}, command: {}'.format(stderr, args)) + else: + _logme.log( + 'qsub failed with code {}\nstdout: {}\nstderr: {}' + .format(code, stdout, stderr), 'critical' + ) + raise _CalledProcessError(code, args, stdout, stderr) + return job_id + + +############################################################################### +# Queue Parsing Functions # +############################################################################### + + +def queue_parser(user=None, partition=None): + """Iterator for torque queues. + + Use the `qstat -x -t` command to get an XML queue for compatibility. + + Args: + user: optional user name to pass to qstat to filter queue with + partition: optional partition to filter the queue with + + Yields: + tuple: job_id, array_id, name, userid, partition, state, nodelist, + numnodes, ntpernode, exit_code + + numcpus is currently always 1 as most torque queues treat every core as a + node. + """ + # I am not using run.cmd because I want to catch XML errors also + try_count = 0 + qargs = ['qstat', '-x', '-t'] + r = _re.compile(r'<Variable_List>.*?</Variable_List>') + while True: + try: + xmlstr = _check_output(qargs) + try: + xmlstr = xmlstr.decode() + except AttributeError: + pass + # Get rid of the Variable_List as it is just the environment + # and can sometimes have nonsensical characters. 
+ xmlstr = r.sub('', xmlstr) + xmlqueue = _ET.fromstring(xmlstr) + except _CalledProcessError: + _sleep(1) + if try_count == 5: + raise + else: + try_count += 1 + except _ET.ParseError: + # ElementTree throws error when string is empty + _sleep(1) + if try_count == 1: + xmlqueue = None + break + else: + try_count += 1 + else: + break + + if xmlqueue is not None: + for xmljob in xmlqueue: + job_id, array_id = normalize_job_id(xmljob.find('Job_Id').text) + job_owner = xmljob.find('Job_Owner').text.split('@')[0] + if user and job_owner != user: + continue + job_name = xmljob.find('Job_Name').text + job_queue = xmljob.find('queue').text + job_state = xmljob.find('job_state').text + job_state = TORQUE_SLURM_STATES[job_state] + _logme.log('Job {} state: {}'.format(job_id, job_state), + 'debug') + ndsx = xmljob.find('exec_host') + if hasattr(ndsx, 'text') and ndsx.text: + nds = ndsx.text.split('+') + else: + nds = [] + nodes = [] + # Convert node range to individual nodes in list + for node in nds: + if '-' in node: + nm, num = node.split('/') + for i in range(*[int(i) for i in num.split('-')]): + nodes.append(nm + '/' + str(i).zfill(2)) + else: + nodes.append(node) + # I assume that every 'node' is a core, as that is the + # default for torque, but it isn't always true + job_threads = len(nodes) + exitcode = xmljob.find('exit_status') + if hasattr(exitcode, 'text'): + exitcode = int(exitcode.text) + else: + exitcode = None + + if partition and job_queue != partition: + continue + + # Torque doesn't have a variable scpu + scpus = 1 + yield (job_id, array_id, job_name, job_owner, job_queue, job_state, + nodes, job_threads, scpus, exitcode) + + +def parse_strange_options(option_dict): + """Parse all options that cannot be handled by the regular function. + + Returns: + list: A list of strings + dict: Altered version of option_dict + """ + outlist = [] + # Handle cores separately + nodes = int(option_dict.pop('nodes')) if 'nodes' in option_dict else 1 + cores = int(option_dict.pop('cores')) if 'cores' in option_dict else 1 + + outstring = '#PBS -l nodes={}:ppn={}'.format(nodes, cores) + if 'features' in option_dict: + outstring += ':' + ':'.join( + _run.opt_split(option_dict.pop('features'), (',', ':'))) + if 'qos' in option_dict: + outstring += ',qos={}'.format(option_dict.pop('qos')) + outlist.append(outstring) + + return outlist, option_dict diff --git a/fyrd/conf.py b/fyrd/conf.py index a9b0713..2da5a5c 100644 --- a/fyrd/conf.py +++ b/fyrd/conf.py @@ -26,7 +26,8 @@ from . import run as _run from . import logme as _logme -from . import options as _opt +from . import batch_systems as _batch +_opt = _batch.options ############################################################################### @@ -53,6 +54,7 @@ 'queue_type': 'auto', 'sbatch': None, # Path to sbatch command 'qsub': None, # Path to qsub command + 'progressbar': True, # Not implemented yet # 'db': _os.path.join(CONFIG_PATH, 'db.sql'), }, @@ -69,9 +71,6 @@ CONFIG_PATH, 'profiles.txt' ) }, - 'jobqueue': { - 'jobno': 1 - }, } CONF_HELP = { @@ -105,13 +104,14 @@ running or completed again after some time so it makes sense to wait a bit, but not forever. The default is 45 minutes: 2700 seconds. - queue_type (str): the type of queue to use, one of 'torque', - 'slurm', 'local', 'auto'. Default is auto to - auto-detect the queue. + queue_type (str): the type of queue to use, one of the batch + systems (e.g. 'slurm') or 'auto'. Default is + auto to auto-detect the queue. 
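As a sketch of how these settings are meant to be consumed (the sbatch path is hypothetical), the queue type can be pinned in the config file rather than auto-detected::

    import fyrd.conf as conf

    conf.set_option('queue', 'queue_type', 'slurm')  # skip auto-detection
    conf.set_option('queue', 'sbatch', '/opt/slurm/bin/sbatch')
    assert conf.get_option('queue', 'queue_type') == 'slurm'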
sbatch (str): A path to the sbatch executable, only required for slurm mode if sbatch is not in the PATH. qsub (str): A path to the qsub executable, only required for torque mode if sbatch is not in the PATH. + progressbar (bool): Show a progress bar when waiting for jobs db_path (str): Where to put the job database (Not implemented) """ ), @@ -149,13 +149,6 @@ profile_file (str): the config file where profiles are defined. """ ), - 'jobqueue': _dnt( - """ - [jobqueue] - Sets options for the local queue system, will be removed in the future - in favor of database. - """ - ), } # Pre-defined profiles, 'DEFAULT' is required. diff --git a/fyrd/helpers.py b/fyrd/helpers.py index cdf1d02..79216e0 100644 --- a/fyrd/helpers.py +++ b/fyrd/helpers.py @@ -11,9 +11,11 @@ from . import run as _run from . import conf as _conf from . import logme as _logme -from . import options as _options +from . import batch_systems as _batch from .job import Job as _Job +_options = _batch.options + ############################################################################### # Try Import Non-Required Modules # ############################################################################### @@ -70,20 +72,19 @@ def parapply_summary(jobs, df, func, args=(), profile=None, applymap=False, jobs, df, func, args=args, profile=profile, applymap=applymap, name=name, imports=imports, **kwds ) - else: - kwargs = dict( - args=args, profile=profile, applymap=applymap, - name=name, imports=imports - ) - kwds = _options.sanitize_arguments(kwds) - kwargs.update(kwds) - kwargs['imports'] = _run.get_all_imports(func, kwargs) - kwargs['syspaths'] = _run.update_syspaths(func, kwargs) - return _wrap_runner( - _parapply_summary, - *(jobs, df, func), - **kwargs + kwargs = dict( + args=args, profile=profile, applymap=applymap, + name=name, imports=imports ) + kwds = _options.sanitize_arguments(kwds) + kwargs.update(kwds) + kwargs['imports'] = _run.get_all_imports(func, kwargs) + kwargs['syspaths'] = _run.update_syspaths(func, kwargs) + return _wrap_runner( + _parapply_summary, + *(jobs, df, func), + **kwargs + ) def _parapply_summary(jobs, df, func, args=(), profile=None, applymap=False, @@ -151,21 +152,20 @@ def parapply(jobs, df, func, args=(), profile=None, applymap=False, merge_axis=merge_axis, merge_apply=merge_apply, name=name, imports=imports, **kwds ) - else: - kwargs = dict( - args=args, profile=profile, applymap=applymap, - merge_axis=merge_axis, merge_apply=merge_apply, name=name, - imports=imports - ) - kwds = _options.sanitize_arguments(kwds) - kwargs.update(kwds) - kwargs['imports'] = _run.get_all_imports(func, kwargs) - kwargs['syspaths'] = _run.update_syspaths(func, kwargs) - return _wrap_runner( - _parapply, - *(jobs, df, func), - **kwargs - ) + kwargs = dict( + args=args, profile=profile, applymap=applymap, + merge_axis=merge_axis, merge_apply=merge_apply, name=name, + imports=imports + ) + kwds = _options.sanitize_arguments(kwds) + kwargs.update(kwds) + kwargs['imports'] = _run.get_all_imports(func, kwargs) + kwargs['syspaths'] = _run.update_syspaths(func, kwargs) + return _wrap_runner( + _parapply, + *(jobs, df, func), + **kwargs + ) def _parapply(jobs, df, func, args=(), profile=None, applymap=False, diff --git a/fyrd/job.py b/fyrd/job.py index 262ddf0..f9053af 100644 --- a/fyrd/job.py +++ b/fyrd/job.py @@ -7,7 +7,6 @@ from uuid import uuid4 as _uuid from time import sleep as _sleep from datetime import datetime as _dt -from subprocess import CalledProcessError as _CalledProcessError # Try to use dill, revert to pickle 
if not found import dill as _pickle @@ -21,13 +20,11 @@ from . import conf as _conf from . import queue as _queue from . import logme as _logme -from . import local as _local -from . import options as _options from . import script_runners as _scrpts +from . import batch_systems as _batch from . import ClusterError as _ClusterError -from .submission_scripts import Script as _Script from .submission_scripts import Function as _Function - +_options = _batch.options __all__ = ['Job'] @@ -93,13 +90,23 @@ class Job(object): """ id = None + name = None + suffix = None submitted = False written = False found = False submit_time = None + state = None + kind = None + + # Runtime + nodes = None + cores = None + modules = None - # Holds a pool object if we are in local mode - pool_job = None + # Files + outfile = None + errfile = None # Scripts submission = None @@ -134,6 +141,11 @@ class Job(object): # Track update status _updating = False + # Track preparations + initialized = False + scripts_ready = False + _kwargs = None + # Auto Cleaning clean_files = _conf.get_option('jobs', 'clean_files') clean_outputs = _conf.get_option('jobs', 'clean_outputs') @@ -168,11 +180,8 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, kwds = _options.check_arguments(kwds) _logme.log('Args post-check: {}'.format(kwds), 'debug') - # Override autoclean state (set in config file) - if 'clean_files' in kwds: - self.clean_files = kwds.pop('clean_files') - if 'clean_outputs' in kwds: - self.clean_outputs = kwds.pop('clean_outputs') + # Create a unique short UUID for this job + self.uuid = str(_uuid()).split('-')[0] # Path handling [kwds, self.runpath, @@ -181,10 +190,111 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, # Save command self.command = command self.args = args + self.kwargs = kwargs + self.profile = profile + + # Get environment + if not _batch.MODE: + _batch.MODE = _batch.get_cluster_environment() + self.qtype = qtype if qtype else _batch.MODE + self.queue = _queue.Queue(user='self', qtype=self.qtype) + self.batch = _batch.get_batch_system(self.qtype) + self.state = 'Not_Submitted' + + # Save keywords for posterity and parsing + self.kwds = kwds + + self.name = self._update_name(name) + + self.initialize() + + ########################################################################## + # Public Methods # + ########################################################################## + + ################ + # Properties # + ################ + + @property + def files(self): + """Build a list of files associated with this class.""" + files = [self.submission] + if self.kind == 'script': + files.append(self.exec_script) + if self.kind == 'function': + files.append(self.function) + return files + + @property + def runtime(self): + """Return the runtime.""" + if not self.done: + _logme.log('Cannot get runtime as not yet complete.', 'warn') + return None + if not self.start: + self.get_times() + return self.end-self.start + + @property + def done(self): + """Check if completed or not. + + Updates the Job and Queue. + + Returns: + Bool: True if complete, False otherwise. + """ + # Check the state twice so we only update the queue when necessary.
+ if self.state in _queue.DONE_STATES: + return True + if not self._updating: + self.update() + if self.state in _queue.DONE_STATES: + return True + return False + + @property + def outfiles(self): + """A list of all outfiles associated with this Job.""" + outfiles = [self.outfile, self.errfile] + if self.poutfile: + outfiles.append(self.poutfile) + return outfiles + + @property + def incomplete_outfiles(self): + """A list of all outfiles that haven't already been fetched.""" + outfiles = [] + if self.outfile and not self._got_stdout: + outfiles.append(self.outfile) + if self.errfile and not self._got_stderr: + outfiles.append(self.errfile) + if self.poutfile and not self._got_out: + outfiles.append(self.poutfile) + return outfiles + + ############################### + # Core Job Handling Methods # + ############################### + + def initialize(self): + """Make self runnable using set attributes.""" + kwds = self.kwds + + # Override autoclean state (set in config file) + if 'clean_files' in kwds: + self.clean_files = kwds.pop('clean_files') + if 'clean_outputs' in kwds: + self.clean_outputs = kwds.pop('clean_outputs') + + # Set suffix + self.suffix = kwds.pop('suffix') if 'suffix' in kwds \ + else _conf.get_option('jobs', 'suffix') # Merge in profile, this includes all args from the DEFAULT profile # as well, ensuring that those are always set at a minumum. - profile = profile if profile else 'DEFAULT' + profile = self.profile if self.profile else 'DEFAULT' prof = _conf.get_profile(profile) if not prof: raise _ClusterError('No profile found for {}'.format(profile)) @@ -201,51 +311,14 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, .format(opt, opt, arg), 'debug') kwds[opt] = arg - # Get environment - if not _queue.MODE: - _queue.MODE = _queue.get_cluster_environment() - self.qtype = qtype if qtype else _queue.MODE - self.queue = _queue.Queue(user='self', qtype=self.qtype) - self.state = 'Not_Submitted' - - # Set name - if not name: - if callable(command): - strcmd = str(command).strip('<>') - parts = strcmd.split(' ') - if parts[0] == 'bound': - name = '_'.join(parts[2:3]) - else: - parts.remove('function') - try: - parts.remove('built-in') - except ValueError: - pass - name = parts[0] - else: - name = command.split(' ')[0].split('/')[-1] - - # Make sure name not in queue - self.uuid = str(_uuid()).split('-')[0] - names = [i.name.split('.')[0] for i in self.queue] - namecnt = len([i for i in names if i == name]) - name = '{}.{}.{}'.format(name, namecnt, self.uuid) - self.name = name - # Set modules self.modules = kwds.pop('modules') if 'modules' in kwds else None if self.modules: self.modules = _run.opt_split(self.modules, (',', ';')) - # Make sure args are a tuple or dictionary - if args: - if isinstance(args, str): - args = tuple(args) - if not isinstance(args, (tuple, dict)): - try: - args = tuple(args) - except TypeError: - args = (args,) + # Make sure args are a tuple + if self.args: + self.args = tuple(_run.listify(self.args)) # In case cores are passed as None if 'nodes' not in kwds: @@ -256,8 +329,6 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, self.cores = kwds['cores'] # Set output files - suffix = kwds.pop('suffix') if 'suffix' in kwds \ - else _conf.get_option('jobs', 'suffix') if 'outfile' in kwds: pth, fle = _os.path.split(kwds['outfile']) if not pth: @@ -265,7 +336,7 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, kwds['outfile'] = _os.path.join(pth, fle) else: kwds['outfile'] = 
_os.path.join( - self.outpath, '.'.join([name, suffix, 'out'])) + self.outpath, '.'.join([self.name, self.suffix, 'out'])) if 'errfile' in kwds: pth, fle = _os.path.split(kwds['errfile']) if not pth: @@ -273,7 +344,7 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, kwds['errfile'] = _os.path.join(pth, fle) else: kwds['errfile'] = _os.path.join( - self.outpath, '.'.join([name, suffix, 'err'])) + self.outpath, '.'.join([self.name, self.suffix, 'err'])) self.outfile = kwds['outfile'] self.errfile = kwds['errfile'] @@ -281,19 +352,32 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, if 'depends' in kwds: dependencies = _run.listify(kwds.pop('depends')) self.dependencies = [] - errmsg = 'Dependencies must be number or list' + errmsg = 'Dependencies must be number, numeric string or Job' for dependency in dependencies: - if isinstance(dependency, str): - if not dependency.isdigit(): - raise _ClusterError(errmsg) - dependency = int(dependency) - if not isinstance(dependency, (int, Job)): + if not isinstance(dependency, (int, str, Job)): raise _ClusterError(errmsg) self.dependencies.append(dependency) + # Save parsed keywords as _kwargs + self._kwargs = kwds + + self.initialized = True + + return self + + def gen_scripts(self): + """Create the script objects from the set parameters.""" + if not self.initialized: + self.initialize() + ###################################### # Command and Function Preparation # ###################################### + command = self.command + args = self.args + kwargs = self.kwargs # Not self._kwargs + name = self._update_name() + kwds = self._kwargs # Get imports imports = kwds.pop('imports') if 'imports' in kwds else None @@ -316,7 +400,7 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, if callable(command): self.kind = 'function' script_file = _os.path.join( - self.scriptpath, '{}_func.{}.py'.format(name, suffix) + self.scriptpath, '{}_func.{}.py'.format(name, self.suffix) ) + self.poutfile = self.outfile + '.func.pickle' + self.function = _Function( @@ -343,70 +427,22 @@ def __init__(self, command, args=None, kwargs=None, name=None, qtype=None, ##################### # Build execution wrapper with modules - precmd = '' + modstr = '' if self.modules: for module in self.modules: - precmd += 'module load {}\n'.format(module) - - # Create queue-dependent scripts - sub_script = '' - if self.qtype == 'slurm': - scrpt = _os.path.join( - self.scriptpath, '{}.{}.sbatch'.format(name, suffix) - ) + modstr += 'module load {}\n'.format(module) - # We use a separate script and a single srun command to avoid - # issues with multiple threads running at once - exec_script = _os.path.join(self.scriptpath, - '{}.{}.script'.format(name, suffix)) - exe_script = _scrpts.CMND_RUNNER_TRACK.format( - precmd=precmd, usedir=self.runpath, name=name, command=command) - # Create the exec_script Script object - self.exec_script = _Script(script=exe_script, - file_name=exec_script) - - # Add all of the keyword arguments at once - precmd = _options.options_to_string(kwds, self.qtype) + precmd - - ecmnd = 'srun bash {}'.format(exec_script) - sub_script = _scrpts.SCRP_RUNNER.format(precmd=precmd, - script=exec_script, - command=ecmnd) - - elif self.qtype == 'torque': - scrpt = _os.path.join(self.scriptpath, - '{}.cluster.qsub'.format(name)) - - # Add all of the keyword arguments at once - precmd = _options.options_to_string(kwds, self.qtype) + precmd - - sub_script = _scrpts.CMND_RUNNER_TRACK.format( - precmd=precmd, 
usedir=self.runpath, name=name, command=command) - - elif self.qtype == 'local': - # Create the pool - if not _local.JQUEUE or not _local.JQUEUE.runner.is_alive(): - threads = kwds['threads'] if 'threads' in kwds \ - else _local.THREADS - _local.JQUEUE = _local.JobQueue(cores=threads) - - scrpt = _os.path.join(self.scriptpath, '{}.cluster'.format(name)) - sub_script = _scrpts.CMND_RUNNER_TRACK.format( - precmd=precmd, usedir=self.runpath, name=name, command=command) + # Add all of the keyword arguments at once + precmd = _options.options_to_string(kwds, self.qtype) + '\n\n' + modstr - else: - raise _ClusterError('Invalid queue type') - - # Create the submission Script object - self.submission = _Script(script=sub_script, - file_name=scrpt) + # Create queue-dependent scripts + self.submission, self.exec_script = self.batch.gen_scripts( + self, command, args, precmd, modstr + ) - # Save the keyword arguments for posterity - self.kwargs = kwds + self.scripts_ready = True - #################### - # Public Methods # - #################### + return self def write(self, overwrite=True): """Write all scripts. @@ -414,6 +450,8 @@ def write(self, overwrite=True): Args: overwrite (bool): Overwrite existing files, defaults to True. """ + if not self.scripts_ready: + self.gen_scripts() _logme.log('Writing files, overwrite={}'.format(overwrite), 'debug') self.submission.write(overwrite) if self.exec_script: @@ -421,32 +459,7 @@ def write(self, overwrite=True): if self.function: self.function.write(overwrite) self.written = True - - def clean(self, delete_outputs=None, get_outputs=True): - """Delete all scripts created by this module, if they were written. - - Args: - delete_outputs (bool): also delete all output and err files, - but get their contents first. - get_outputs (bool): if delete_outputs, save outputs before - deleting. - """ - _logme.log('Cleaning outputs, delete_outputs={}' - .format(delete_outputs), 'debug') - if delete_outputs is None: - delete_outputs = self.clean_outputs - assert isinstance(delete_outputs, bool) - for jobfile in [self.submission, self.exec_script, self.function]: - if jobfile: - jobfile.clean() - if delete_outputs: - _logme.log('Deleting output files.', 'debug') - if get_outputs: - self.fetch_outputs(delete_files=True) - for f in self.outfiles: - if _os.path.isfile(f): - _logme.log('Deleteing {}'.format(f), 'debug') - _os.remove(f) + return self def submit(self, wait_on_max_queue=True): """Submit this job. 
@@ -481,9 +494,9 @@ def submit(self, wait_on_max_queue=True): 'error' ) return self - dependencies.append(int(depend.id)) + dependencies.append(str(depend.id)) else: - dependencies.append(int(depend)) + dependencies.append(str(depend)) # Wait on the queue if necessary if wait_on_max_queue: @@ -527,91 +540,14 @@ def submit(self, wait_on_max_queue=True): 'returned an unrecognized value {}' .format(dep_check)) - if self.qtype == 'local': - # Normal mode dependency tracking uses only integer job numbers - _logme.log('Submitting to local', 'debug') - command = 'bash {}'.format(self.submission.file_name) - fileargs = dict(stdout=self.outfile, - stderr=self.errfile) - - # Make sure the global job pool exists - if not _local.JQUEUE or not _local.JQUEUE.runner.is_alive(): - _local.JQUEUE = _local.JobQueue(cores=_local.THREADS) - self.id = _local.JQUEUE.add(_run.cmd, args=(command,), - kwargs=fileargs, - dependencies=depends, - cores=self.cores) - self.submitted = True - self.submit_time = _dt.now() - self.state = 'submitted' - - elif self.qtype == 'slurm': - _logme.log('Submitting to slurm', 'debug') - if self.depends: - deps = '--dependency=afterok:{}'.format( - ':'.join([str(d) for d in depends])) - args = ['sbatch', deps, self.submission.file_name] - else: - args = ['sbatch', self.submission.file_name] + self.id = self.batch.submit( + self.submission.file_name, + self.depends + ) - # Try to submit job 5 times - code, stdout, stderr = _run.cmd(args, tries=5) - if code == 0: - self.id = int(stdout.split(' ')[-1]) - else: - _logme.log('sbatch failed with code {}\n'.format(code) + - 'stdout: {}\nstderr: {}'.format(stdout, stderr), - 'critical') - raise _CalledProcessError(code, args, stdout, stderr) - self.submitted = True - self.submit_time = _dt.now() - self.state = 'submitted' - - elif self.qtype == 'torque': - _logme.log('Submitting to torque', 'debug') - if self.depends: - deps = '-W depend={}'.format( - ','.join(['afterok:' + str(d) for d in depends])) - args = ['qsub', deps, self.submission.file_name] - else: - args = ['qsub', self.submission.file_name] - - # Try to submit job 5 times - code, stdout, stderr = _run.cmd(args, tries=5) - if code == 0: - self.id = int(stdout.split('.')[0]) - elif code == 17 and 'Unable to open script file' in stderr: - _logme.log('qsub submission failed due to an already existing ' - 'script file, attempting to rename file and try ' - 'again.\nstderr: {}, stdout: {}, cmnd: {}' - .format(stderr, stdout, args), 'error') - new_name = args[1] + '.resub' - _os.rename(args[1], new_name) - _logme.log('renamed script {} to {}, resubmitting' - .format(args[1], new_name), 'info') - args[1] = new_name - code, stdout, stderr = _run.cmd(args, tries=5) - if code == 0: - self.id = int(stdout.split('.')[0]) - else: - _logme.log('Resubmission still failed, aborting', - 'critical') - raise _CalledProcessError(code, args, stdout, stderr) - else: - if stderr.startswith('qsub: submit error ('): - raise _ClusterError('qsub submission failed with error: ' + - '{}, command: {}'.format(stderr, args)) - else: - _logme.log( - 'qsub failed with code {}\nstdout: {}\nstderr: {}' - .format(code, stdout, stderr), 'critical' - ) - raise _CalledProcessError(code, args, stdout, stderr) - self.submitted = True - self.submit_time = _dt.now() - self.state = 'submitted' - else: - raise _ClusterError("Invalid queue type {}".format(self.qtype)) + self.submitted = True + self.submit_time = _dt.now() + self.state = 'submitted' if not self.submitted: raise _ClusterError('Submission appears to have failed, 
this ' + return self + + def scrub(self, confirm=True): + """Clean everything and reset to an unrun state.""" + msg = ("This will delete all outputs stored in this job, as well " + "as all output files, job files, and scripts. Are you sure " + "you want to do this?") + if confirm: + if not _run.get_yesno(msg, default='n'): + _logme.log('Aborting scrub', 'info') + return self + # Clean old set + self.clean(delete_outputs=True) + # Reset runtime attributes + self.initialized = False + self.scripts_ready = False + self.written = False + self.submitted = False + self.id = None + self.found = False + self.queue_info = None + self.state = 'Not_Submitted' + self._got_out = False + self._got_stdout = False + self._got_stderr = False + self._got_exitcode = False + self._out = None + self._stdout = None + self._stderr = None + self._exitcode = None + self._got_times = False + self._updating = False + self.start = None + self.end = None + return self.update() def resubmit(self): """Attempt to auto resubmit, deletes prior files.""" - self.clean(delete_outputs=True) - self.state = 'Not_Submitted' + self.scrub(confirm=False) + # Rerun + self.initialize() + self.gen_scripts() self.write() return self.submit() + def clean(self, delete_outputs=None, get_outputs=True): + """Delete all scripts created by this module, if they were written. + + Args: + delete_outputs (bool): also delete all output and err files, + but get their contents first. + get_outputs (bool): if delete_outputs, save outputs before + deleting. + """ + _logme.log('Cleaning outputs, delete_outputs={}' + .format(delete_outputs), 'debug') + if delete_outputs is None: + delete_outputs = self.clean_outputs + assert isinstance(delete_outputs, bool) + for jobfile in [self.submission, self.exec_script, self.function]: + if jobfile: + jobfile.clean() + if delete_outputs: + _logme.log('Deleting output files.', 'debug') + if get_outputs: + self.fetch_outputs(delete_files=True) + for f in self.outfiles: + if _os.path.isfile(f): + _logme.log('Deleting {}'.format(f), 'debug') + _os.remove(f) + return self + + ###################### + # Queue Management # + ###################### + + def update(self, fetch_info=True): + """Update status from the queue. + + Args: + fetch_info (bool): Fetch basic job info if complete. 
+ """ + if not self._updating: + self._update(fetch_info) + else: + _logme.log('Already updating, aborting.', 'debug') + return self + + def update_queue_info(self): + """Set queue_info from the queue even if done.""" + _logme.log('Updating queue_info', 'debug') + queue_info1 = self.queue[self.id] + self.queue.update() + queue_info2 = self.queue[self.id] + if queue_info2: + self.queue_info = queue_info2 + elif queue_info1: + self.queue_info = queue_info1 + elif self.queue_info is None and self.submitted: + _logme.log('Cannot find self in the queue and queue_info is empty', + 'warn') + return self.queue_info + + ################################# + # Output Handling and Waiting # + ################################# + def wait(self): """Block until job completes.""" if not self.submitted: @@ -652,7 +684,7 @@ def wait(self): _logme.log('Job failed with exitcode {}' .format(self.exitcode), 'debug') return False - if self.wait_for_files(caution_message=False): + if self._wait_for_files(caution_message=False): self.update() if self.state == 'disappeared': _logme.log('Job files found for disappered job, assuming ' @@ -665,167 +697,27 @@ def wait(self): 'failure', 'error') return False - def wait_for_files(self, btme=None, caution_message=False): - """Block until files appear up to 'file_block_time' in config file. + def get_output(self, save=True, delete_file=None, update=True, + raise_on_error=True): + """Get output of function or script. - Aborts after 2 seconds if job exit code is not 0. + This is the same as stdout for a script, or the function output for + a function. + + By default, output file is kept unless delete_file is True or + self.clean_files is True. Args: - btme (int): Number of seconds to try for before giving - up, default set in config file. - caution_message (bool): Display a message if this is taking - a while. + save (bool): Save the output to self.out, default True. + Would be a good idea to set to False if the + output is huge. + delete_file (bool): Delete the output file when getting + update (bool): Update job info from queue first. + raise_on_error (bool): If the returned output is an Exception, + raise it. Returns: - bool: True if files found - """ - if not self.done: - _logme.log("Cannot wait for files if we aren't complete", - 'warn') - return False - wait_time = 0.1 # seconds - if btme: - lvl = 'debug' - else: - lvl = 'warn' - btme = _conf.get_option('jobs', 'file_block_time') - start = _dt.now() - dsp = False - _logme.log('Checking for output files', 'debug') - while True: - runtime = (_dt.now() - start).seconds - if caution_message and runtime > 1: - _logme.log('Job complete.', 'info') - _logme.log('Waiting for output files to appear.', 'info') - caution_message = False - if not dsp and runtime > 20: - _logme.log('Still waiting for output files to appear', - 'info') - dsp = True - count = 0 - outfiles = self.incomplete_outfiles - tlen = len(outfiles) - if not outfiles: - _logme.log('No incomplete outfiles, assuming all found in ' + - '{} seconds'.format(runtime), 'debug') - break - for i in outfiles: - if _os.path.isfile(i): - count += 1 - if count == tlen: - _logme.log('All output files found in {} seconds' - .format(runtime), 'debug') - break - _sleep(wait_time) - if runtime > btme: - _logme.log('Job completed but files have not appeared for ' + - '>{} seconds'.format(btme), lvl) - return False - self.update() - if runtime > 2 and self.get_exitcode(update=False) != 0: - _logme.log('Job failed with exit code {}.' 
- .format(self.exitcode) + ' Cannot find files.', - 'error') - return False - if _queue.MODE == 'local': - _logme.log('Job output files were not found.', 'error') - _logme.log('Expected files: {}'.format(self.outfiles)) - return False - return True - - def get(self, save=True, cleanup=None, delete_outfiles=None, - del_no_save=None, raise_on_error=True): - """Block until job completed and return output of script/function. - - By default saves all outputs to this class and deletes all intermediate - files. - - Args: - save (bool): Save all outputs to the class also (advised) - cleanup (bool): Clean all intermediate files after job - completes. - delete_outfiles (bool): Clean output files after job completes. - del_no_save (bool): Delete output files even if `save` is - `False` - raise_on_error (bool): If the returned output is an Exception, - raise it. - - Returns: - str: Function output if Function, else STDOUT - """ - _logme.log(('Getting outputs, cleanup={}, autoclean={}, ' - 'delete_outfiles={}').format( - cleanup, self.clean_files, delete_outfiles - ), 'debug') - # Wait for queue - status = self.wait() - if status == 'disappeared': - _logme.log('Job disappeared from queue, attempting to get outputs', - 'debug') - try: - self.fetch_outputs(save=save, delete_files=False, - get_stats=False) - except IOError: - _logme.log('Job disappeared from the queue and files could not' - ' be found, job must have died and been deleted ' - 'from the queue', 'critical') - raise IOError('Job {} disappeared, output files missing' - .format(self)) - elif status is not True: - _logme.log('Wait failed, cannot get outputs, aborting', 'error') - self.update() - if _os.path.isfile(self.errfile): - if _logme.MIN_LEVEL in ['debug', 'verbose']: - _sys.stderr.write('STDERR of Job:\n') - _sys.stderr.write(self.get_stderr(delete_file=False, - update=False)) - if self.poutfile and _os.path.isfile(self.poutfile): - _logme.log('Getting pickled output', 'debug') - self.get_output(delete_file=False, update=False, - raise_on_error=raise_on_error) - else: - _logme.log('Pickled out file does not exist, cannot get error', - 'debug') - return - else: - # Get output - _logme.log('Wait complete, fetching outputs', 'debug') - self.fetch_outputs(save=save, delete_files=False) - out = self.out if save else self.get_output(save=save) - # Cleanup - if cleanup is None: - cleanup = self.clean_files - else: - assert isinstance(cleanup, bool) - if delete_outfiles is None: - delete_outfiles = self.clean_outputs - if save is False: - delete_outfiles = del_no_save if del_no_save is not None else False - if cleanup: - self.clean(delete_outputs=delete_outfiles) - return out - - def get_output(self, save=True, delete_file=None, update=True, - raise_on_error=True): - """Get output of function or script. - - This is the same as stdout for a script, or the function output for - a function. - - By default, output file is kept unless delete_file is True or - self.clean_files is True. - - Args: - save (bool): Save the output to self.out, default True. - Would be a good idea to set to False if the - output is huge. - delete_file (bool): Delete the output file when getting - update (bool): Update job info from queue first. - raise_on_error (bool): If the returned output is an Exception, - raise it. - - Returns: - The output of the script or function. Always a string if script. + The output of the script or function. Always a string if script. 
""" _logme.log(('Getting output, save={}, clean_files={}, ' 'delete_file={}').format( @@ -843,7 +735,7 @@ def get_output(self, save=True, delete_file=None, update=True, self.update() if self.done: if update: - self.wait_for_files() + self._wait_for_files() else: _logme.log('Cannot get pickled output before job completes', 'warn') @@ -903,22 +795,22 @@ def get_stdout(self, save=True, delete_file=None, update=True): self.update() if self.done: if update: - self.wait_for_files() + self._wait_for_files() else: _logme.log('Job not done, attempting to get current STDOUT ' + 'anyway', 'info') - _logme.log('Getting stdout from {}'.format(self.kwargs['outfile']), + _logme.log('Getting stdout from {}'.format(self._kwargs['outfile']), 'debug') - if _os.path.isfile(self.kwargs['outfile']): + if _os.path.isfile(self._kwargs['outfile']): self.get_times(update=False) - stdout = open(self.kwargs['outfile']).read() + stdout = open(self._kwargs['outfile']).read() if stdout: stdouts = stdout.split('\n') stdout = '\n'.join(stdouts[2:-3]) + '\n' if delete_file is True or self.clean_files is True: - _logme.log('Deleting {}'.format(self.kwargs['outfile']), + _logme.log('Deleting {}'.format(self._kwargs['outfile']), 'debug') - _os.remove(self.kwargs['outfile']) + _os.remove(self._kwargs['outfile']) if save: self._stdout = stdout if self.done: @@ -926,7 +818,7 @@ def get_stdout(self, save=True, delete_file=None, update=True): return stdout else: _logme.log('No file at {}, cannot get stdout' - .format(self.kwargs['outfile']), 'warn') + .format(self._kwargs['outfile']), 'warn') return None def get_stderr(self, save=True, delete_file=None, update=True): @@ -958,18 +850,18 @@ def get_stderr(self, save=True, delete_file=None, update=True): self.update() if self.done: if update: - self.wait_for_files() + self._wait_for_files() else: _logme.log('Job not done, attempting to get current STDERR ' + 'anyway', 'info') - _logme.log('Getting stderr from {}'.format(self.kwargs['errfile']), + _logme.log('Getting stderr from {}'.format(self._kwargs['errfile']), 'debug') - if _os.path.isfile(self.kwargs['errfile']): - stderr = open(self.kwargs['errfile']).read() + if _os.path.isfile(self._kwargs['errfile']): + stderr = open(self._kwargs['errfile']).read() if delete_file is True or self.clean_files is True: - _logme.log('Deleting {}'.format(self.kwargs['errfile']), + _logme.log('Deleting {}'.format(self._kwargs['errfile']), 'debug') - _os.remove(self.kwargs['errfile']) + _os.remove(self._kwargs['errfile']) if save: self._stderr = stderr if self.done: @@ -977,7 +869,7 @@ def get_stderr(self, save=True, delete_file=None, update=True): return stderr else: _logme.log('No file at {}, cannot get stderr' - .format(self.kwargs['errfile']), 'warn') + .format(self._kwargs['errfile']), 'warn') return None def get_times(self, update=True): @@ -1000,14 +892,14 @@ def get_times(self, update=True): self.update() if self.done: if update: - self.wait_for_files() + self._wait_for_files() else: _logme.log('Cannot get times until job is complete.', 'warn') return None, None - _logme.log('Getting times from {}'.format(self.kwargs['outfile']), + _logme.log('Getting times from {}'.format(self._kwargs['outfile']), 'debug') - if _os.path.isfile(self.kwargs['outfile']): - stdout = open(self.kwargs['outfile']).read() + if _os.path.isfile(self._kwargs['outfile']): + stdout = open(self._kwargs['outfile']).read() if stdout: stdouts = stdout.split('\n') # Get times @@ -1024,7 +916,7 @@ def get_times(self, update=True): return self.start, self.end else: 
_logme.log('No file at {}, cannot get times' - .format(self.kwargs['outfile']), 'warn') + .format(self._kwargs['outfile']), 'warn') return None def get_exitcode(self, update=True): @@ -1047,7 +939,7 @@ def get_exitcode(self, update=True): self.update() if self.done: if update: - self.wait_for_files() + self._wait_for_files() else: _logme.log('Job is not complete, no exit code yet', 'info') return None @@ -1068,32 +960,6 @@ def get_exitcode(self, update=True): .format(self.name, code), 'error') return code - def update(self, fetch_info=True): - """Update status from the queue. - - Args: - fetch_info (bool): Fetch basic job info if complete. - """ - if not self._updating: - self._update(fetch_info) - else: - _logme.log('Already updating, aborting.', 'debug') - - def update_queue_info(self): - """Set queue_info from the queue even if done.""" - _logme.log('Updating queue_info', 'debug') - queue_info1 = self.queue[self.id] - self.queue.update() - queue_info2 = self.queue[self.id] - if queue_info2: - self.queue_info = queue_info2 - elif queue_info1: - self.queue_info = queue_info1 - elif self.queue_info is None and self.submitted: - _logme.log('Cannot find self in the queue and queue_info is empty', - 'warn') - return self.queue_info - def fetch_outputs(self, save=True, delete_files=None, get_stats=True): """Save all outputs in their current state. No return value. @@ -1120,43 +986,27 @@ def fetch_outputs(self, save=True, delete_files=None, get_stats=True): self.get_stdout(save=True, delete_file=delete_files, update=False) self.get_stderr(save=True, delete_file=delete_files, update=False) - @property - def files(self): - """Build a list of files associated with this class.""" - files = [self.submission] - if self.kind == 'script': - files.append(self.exec_script) - if self.kind == 'function': - files.append(self.function) - return files + ############################## + # Minor management methods # + ############################## - @property - def runtime(self): - """Return the runtime.""" - if not self.done: - _logme.log('Cannot get runtime as not yet complete.' 'warn') - return None - if not self.start: - self.get_times() - return self.end-self.start - - @property - def done(self): - """Check if completed or not. + def get_keywords(self): + """Return a list of the keyword arguments used to make the job.""" + return self.kwds - Updates the Job and Queue. + def set_keywords(self, kwds, replace=False): + """Set the job keywords, just updates self.kwds. - Returns: - Bool: True if complete, False otherwise. + Args: + kwds (dict): Set of valid arguments. + replace (bool): Overwrite the keyword arguments instead of updating. """ - # We have the same statement twice to try and avoid updating. 
- if self.state in _queue.DONE_STATES: - return True - if not self._updating: - self.update() - if self.state in _queue.DONE_STATES: - return True - return False + kwds = _options.check_arguments(kwds) + if replace: + self.kwds = kwds + else: + for key, value in kwds.items(): + self.kwds[key] = value ############### # Internals # @@ -1182,7 +1032,7 @@ def _update(self, fetch_info=True): self.queue_info = queue_info self.state = self.queue_info.state if self.done and fetch_info: - if self.wait_for_files(btme=1, caution_message=False): + if self._wait_for_files(btme=1, caution_message=False): if not self._got_exitcode: self.get_exitcode(update=False) if not self._got_times: @@ -1204,25 +1054,179 @@ def _update(self, fetch_info=True): .format(s), 'warn') self._updating = False - @property - def outfiles(self): - """A list of all outfiles associated with this Job.""" - outfiles = [self.outfile, self.errfile] - if self.poutfile: - outfiles.append(self.poutfile) - return outfiles + def _wait_for_files(self, btme=None, caution_message=False): + """Block until files appear up to 'file_block_time' in config file. - @property - def incomplete_outfiles(self): - """A list of all outfiles that haven't already been fetched.""" - outfiles = [] - if self.outfile and not self._got_stdout: - outfiles.append(self.outfile) - if self.errfile and not self._got_stderr: - outfiles.append(self.errfile) - if self.poutfile and not self._got_out: - outfiles.append(self.poutfile) - return outfiles + Aborts after 2 seconds if job exit code is not 0. + + Args: + btme (int): Number of seconds to try for before giving + up, default set in config file. + caution_message (bool): Display a message if this is taking + a while. + + Returns: + bool: True if files found + """ + if not self.done: + _logme.log("Cannot wait for files if we aren't complete", + 'warn') + return False + wait_time = 0.1 # seconds + if btme: + lvl = 'debug' + else: + lvl = 'warn' + btme = _conf.get_option('jobs', 'file_block_time') + start = _dt.now() + dsp = False + _logme.log('Checking for output files', 'debug') + while True: + runtime = (_dt.now() - start).seconds + if caution_message and runtime > 1: + _logme.log('Job complete.', 'info') + _logme.log('Waiting for output files to appear.', 'info') + caution_message = False + if not dsp and runtime > 20: + _logme.log('Still waiting for output files to appear', + 'info') + dsp = True + count = 0 + outfiles = self.incomplete_outfiles + tlen = len(outfiles) + if not outfiles: + _logme.log('No incomplete outfiles, assuming all found in ' + + '{} seconds'.format(runtime), 'debug') + break + for i in outfiles: + if _os.path.isfile(i): + count += 1 + if count == tlen: + _logme.log('All output files found in {} seconds' + .format(runtime), 'debug') + break + _sleep(wait_time) + if runtime > btme: + _logme.log('Job completed but files have not appeared for ' + + '>{} seconds'.format(btme), lvl) + return False + self.update() + if runtime > 2 and self.get_exitcode(update=False) != 0: + _logme.log('Job failed with exit code {}.' + .format(self.exitcode) + ' Cannot find files.', + 'error') + return False + return True + + def get(self, save=True, cleanup=None, delete_outfiles=None, + del_no_save=None, raise_on_error=True): + """Block until job completed and return output of script/function. + + By default saves all outputs to this class and deletes all intermediate + files. + + Args: + save (bool): Save all outputs to the class also (advised) + cleanup (bool): Clean all intermediate files after job + completes. 
+            delete_outfiles (bool): Clean output files after job completes.
+            del_no_save (bool):     Delete output files even if `save` is
+                                    `False`
+            raise_on_error (bool):  If the returned output is an Exception,
+                                    raise it.
+
+        Returns:
+            str: Function output if Function, else STDOUT
+        """
+        _logme.log(('Getting outputs, cleanup={}, autoclean={}, '
+                    'delete_outfiles={}').format(
+                        cleanup, self.clean_files, delete_outfiles
+                    ), 'debug')
+        # Wait for queue
+        status = self.wait()
+        if status == 'disappeared':
+            _logme.log('Job disappeared from queue, attempting to get outputs',
+                       'debug')
+            try:
+                self.fetch_outputs(save=save, delete_files=False,
+                                   get_stats=False)
+            except IOError:
+                _logme.log('Job disappeared from the queue and files could not'
+                           ' be found, job must have died and been deleted '
+                           'from the queue', 'critical')
+                raise IOError('Job {} disappeared, output files missing'
+                              .format(self))
+        elif status is not True:
+            _logme.log('Wait failed, cannot get outputs, aborting', 'error')
+            self.update()
+            if _os.path.isfile(self.errfile):
+                if _logme.MIN_LEVEL in ['debug', 'verbose']:
+                    _sys.stderr.write('STDERR of Job:\n')
+                    _sys.stderr.write(self.get_stderr(delete_file=False,
+                                                      update=False))
+            if self.poutfile and _os.path.isfile(self.poutfile):
+                _logme.log('Getting pickled output', 'debug')
+                self.get_output(delete_file=False, update=False,
+                                raise_on_error=raise_on_error)
+            else:
+                _logme.log('Pickled out file does not exist, cannot get error',
+                           'debug')
+            return
+        else:
+            # Get output
+            _logme.log('Wait complete, fetching outputs', 'debug')
+            self.fetch_outputs(save=save, delete_files=False)
+            out = self.out if save else self.get_output(save=save)
+        # Cleanup
+        if cleanup is None:
+            cleanup = self.clean_files
+        else:
+            assert isinstance(cleanup, bool)
+        if delete_outfiles is None:
+            delete_outfiles = self.clean_outputs
+        if save is False:
+            delete_outfiles = del_no_save if del_no_save is not None else False
+        if cleanup:
+            self.clean(delete_outputs=delete_outfiles)
+        return out
+
+    def _update_name(self, name=None):
+        """Make sure the job name is unique.
+
+        Args:
+            name (str): A name override; if not provided, self.name is used
+
+        Returns:
+            name
+
+        Sets:
+            self.name
+        """
+        # Set name
+        name = name if name else self.name
+        if not name:
+            if callable(self.command):
+                strcmd = str(self.command).strip('<>')
+                parts = strcmd.split(' ')
+                if parts[0] == 'bound':
+                    name = '_'.join(parts[2:3])
+                else:
+                    parts.remove('function')
+                    try:
+                        parts.remove('built-in')
+                    except ValueError:
+                        pass
+                    name = parts[0]
+            else:
+                name = self.command.split(' ')[0].split('/')[-1]
+
+        # Make sure name not in queue
+        if '.' not in name or not name.split('.')[-1] == self.uuid:
+            name = '{}.{}'.format(name, self.uuid)
+
+        self.name = name
+
+        return name

     def __getattr__(self, key):
         """Dynamically get out, stdout, stderr, and exitcode."""
@@ -1255,14 +1259,14 @@ def __repr__(self):

     def __str__(self):
         """Print job name and ID + status."""
-        if self.done:
-            state = 'completed'
-            id1 = str(self.id)
-        elif self.written:
-            state = 'written'
-            id1 = str(self.id)
-        else:
-            state = 'not written'
-            id1 = 'NA'
+        self.update()
         return "Job: {name} ID: {id}, state: {state}".format(
-            name=self.name, id=id1, state=state)
+            name=self.name, id=self.id, state=self.state)
+
+    def __int__(self):
+        """Return integer of ID."""
+        if self.id:
+            if str(self.id).isdigit():
+                return int(self.id)
+        _logme.log('No ID yet.', 'error')
+        return 0
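With the additions above, ``submit()`` followed by ``get()`` is the whole
user-facing workflow: ``get()`` blocks on the queue, fetches outputs, and
optionally cleans up intermediate files. A minimal sketch of how the two
chain (the command and the 'small' profile are illustrative only)::

    import fyrd

    job = fyrd.Job('echo hello', profile='small').submit()
    out = job.get(save=True, cleanup=True)  # block, fetch outputs, clean up

For function jobs ``get()`` returns the unpickled function output; for shell
jobs it returns STDOUT, as described in the docstring above.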
diff --git a/fyrd/local.py b/fyrd/local.py
deleted file mode 100644
index 1a0b5b5..0000000
--- a/fyrd/local.py
+++ /dev/null
@@ -1,442 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Manage job dependency tracking with multiprocessing.
-
-Runs jobs with a multiprocessing.Pool, but manages dependency using an
-additional Process that loops through all submitted jobs and checks
-dependencies before running.
-
-The JobQueue class works as the queue and functions in a similar, but much more
-basic, way as torque or slurm. It manages jobs by forking an instance of the
-job_runner function and keeping it alive. The multiprocessing.Queue class is
-then used to pass job information contained in the Job class back and forth
-between the JobQueue class running in the main process and the job_runner()
-fork running as a separate thread.
-
-The actual job management is done by job_runner() and uses
-multiprocessing.Process objects and not the Pool object. This allows for more
-careful management and it also allows exit codes to be captured.
-"""
-import os
-import sys
-import atexit
-import signal
-import multiprocessing as mp
-from multiprocessing import cpu_count as _cnt
-from subprocess import check_output, CalledProcessError
-from time import sleep
-
-from . import run
-
-# Get defaults
-from . import conf
-
-# Get an Exception object to use
-from . import ClusterError
-
-# Get logging function
-from . import logme
-
-# A global placeholder for a single JobQueue instance
-JQUEUE = None
-
-__all__ = ['JobQueue']
-
-################################
-#  Normal Mode Multithreading  #
-################################
-
-THREADS = _cnt()
-
-# Reset broken multithreading
-# Some of the numpy C libraries can break multithreading, this command
-# fixes the issue.
-try: - check_output("taskset -p 0xff %d >/dev/null 2>/dev/null" % os.getpid(), - shell=True) -except CalledProcessError: - pass # This doesn't work on Macs or Windows - - -############################################################################### -# The JobQueue Class to Manage Jobs # -############################################################################### - - -class JobQueue(object): - - """Monitor and submit multiprocessing.Pool jobs with dependencies.""" - - def __init__(self, cores=None): - """Spawn a job_runner process to interact with.""" - self._jobqueue = mp.Queue() - self._outputs = mp.Queue() - self.jobno = int(conf.get_option('jobqueue', 'jobno', '1')) - self.cores = int(cores) if cores else THREADS - self.runner = mp.Process(target=job_runner, - args=(self._jobqueue, - self._outputs, - self.cores, - self.jobnumber), - name='Runner') - self.runner.start() - self.pid = self.runner.pid - assert self.runner.is_alive() - self.jobs = {} - - def terminate(): - """Kill the queue runner.""" - try: - self.runner.terminate() - self._jobqueue.close() - self._outputs.close() - except AttributeError: - pass - if run.check_pid(self.runner.pid): - os.kill(self.runner.pid, signal.SIGKILL) - - # Call terminate when we exit - atexit.register(terminate) - - def update(self): - """Get fresh job info from the runner.""" - sleep(0.5) # This allows the queue time to flush - if self.runner.is_alive() is not True: - self.restart(True) - if self.runner.is_alive() is not True: - raise ClusterError('JobRunner has crashed') - while not self._outputs.empty(): - # We loop through the whole queue stack, updating the dictionary - # every time so that we get the latest info - self.jobs.update(self._outputs.get_nowait()) - if self.jobs: - self.jobno = max(self.jobs.keys()) - conf.set_option('jobqueue', 'jobno', str(self.jobno)) - - def add(self, function, args=None, kwargs=None, dependencies=None, - cores=1): - """Add function to local job queue. - - Args: - function: A function object. To run a command, use the run.cmd - function here. - args: A tuple of args to submit to the function. - kwargs: A dict of keyword arguments to submit to the function. - dependencies: A list of job IDs that this job will depend on. - cores: The number of threads required by this job. - - Returns: - int: A job ID - """ - self.update() - assert self.runner.is_alive() - oldjob = self.jobno - cores = int(cores) - if cores > self.cores: - logme.log('Job core request exceeds resources, limiting to max: ' + - '{}'.format(self.cores), 'warn') - cores = self.cores - self._jobqueue.put(Job(function, args, kwargs, dependencies, - cores)) - sleep(0.5) - self.update() - newjob = self.jobno - # Sometimes the queue can freeze for reasons I don't understand, this - # is an attempted workaround. - if not newjob == oldjob + 1: - self.restart(True) - self._jobqueue.put(Job(function, args, kwargs, dependencies, - cores)) - self.update() - newjob = self.jobno - if not newjob == oldjob + 1: - raise ClusterError('Job numbers are not updating correctly, the ' - 'local queue has probably crashed. 
Please ' - 'report this issue.') - return self.jobno - - def wait(self, jobs=None): - """Wait for a list of jobs, all jobs are the default.""" - self.update() - if not isinstance(jobs, (list, tuple)): - jobs = [jobs] - while jobs: - for job in jobs: - if job not in self.jobs: - raise ClusterError('Job {} has not been submitted.'.format(job)) - if self.jobs[job].state == 'done': - jobs.remove(job) - sleep(0.5) - - def get(self, job): - """Return the output of a single job""" - if job not in self.jobs: - raise ClusterError('Job {} has not been submitted.'.format(job)) - self.wait(job) - return self.jobs[job].out - - def restart(self, force=False): - """Kill the job queue and restart it.""" - if not force: - self.update() - if len(self.done) != len(self.jobs): - logme.log('Cannot restart, incomplete jobs', 'error') - return - self.runner.terminate() - self.runner = mp.Process(target=job_runner, - args=(self._jobqueue, self._outputs, - self.cores, self.jobnumber), - name='Runner') - self.runner.start() - self.pid = self.runner.pid - assert self.runner.is_alive() - - def __getattr__(self, attr): - """Dynamic dictionary filtering.""" - if attr == 'done' or attr == 'queued' or attr == 'waiting' \ - or attr == 'running': - newdict = {} - for jobid, job_info in self.jobs.items(): - if job_info.state == attr: - newdict[jobid] = job_info - return newdict - - def __getitem__(self, key): - """Allow direct accessing of jobs by job id.""" - self.update() - key = int(key) - try: - return self.jobs[key] - except KeyError: - return None - - def __iter__(self): - """Allow us to be iterable""" - self.update() - for jobno, job in self.jobs.items(): - yield jobno, job - - def __len__(self): - """Length is the total job count.""" - self.update() - return len(self.jobs) - - def __repr__(self): - """Class information.""" - self.update() - return ("JobQueue<({})jobs:{};completed:{};running:{};queued:{}>" - .format(self.cores, len(self.jobs), len(self.done), - len(self.running), - len(self.waiting) + len(self.queued))) - - def __str__(self): - """Print jobs.""" - self.update() - return str(self.jobs) - -class Job(object): - - """An object to pass arguments to the runner.""" - - def __init__(self, function, args=None, kwargs=None, depends=None, - cores=1): - """Parse and save arguments.""" - if args and not isinstance(args, tuple): - args = (args,) - if kwargs and not isinstance(kwargs, dict): - raise TypeError('kwargs must be a dict') - if depends: - if not isinstance(depends, (tuple, list)): - depends = [depends] - try: - depends = [int(i) for i in depends] - except ValueError: - raise ValueError('dependencies must be integer job ids') - self.function = function - self.args = args - self.kwargs = kwargs - self.depends = depends - self.cores = int(cores) - - # Assigned later - self.id = None - self.pid = None - self.exitcode = None - self.out = None - self.state = 'Not Submitted' - - def __repr__(self): - """Job Info.""" - return ("Job<{} (function:{},args:{}," + - "kwargs:{};cores:{}) {}>").format( - self.id, self.function.__name__, self.args, - self.kwargs, self.cores, self.state) - - def __str__(self): - """Print Info and Output.""" - outstr = "Job #{}; Cores: {}\n".format( - self.id if self.id else 'NA', self.cores) - outstr += "\tFunction: {}\n\targs: {}\n\tkwargs: {}\n\t".format( - self.function.__name__, self.args, self.kwargs) - outstr += "State: {}\n\tOutput: {}\n".format(self.state, self.out) - return outstr - - -############################################################################### -# The Job 
Runner that will fork and run all jobs # -############################################################################### - - -def job_runner(jobqueue, outputs, cores=None, jobno=None): - """Run jobs with dependency tracking. - - Must be run as a separate multiprocessing.Process to function correctly. - - Args: - jobqueue: A multiprocessing.Queue object into which Job objects must be - added. The function continually searches this Queue for new - jobs. Note, function must be a function call, it cannot be - anything else. function is the only required argument, the - rest are optional. tuples are required. - outputs: A multiprocessing.Queue object that will take outputs. A - dictionary of job objects will be output here with the - format:: {job_no => Job} - **NOTE**: function return must be picklable otherwise this - will raise an exception when it is put into the Queue object. - cores: Number of cores to use in the multiprocessing pool. Defaults - to all. - jobno: What number to start counting jobs from, default 1. - """ - - def output(out): - """Explicitly clear the dictionary before sending the output.""" - # Don't send output if it is the same as last time. - lastout = outputs.get() if not outputs.empty() else '' - if out == lastout: - return - while not outputs.empty(): - # Clear the output object - outputs.get() - outputs.put(out) - - # Make sure we have Queue objects - if not isinstance(jobqueue, mp.queues.Queue) \ - or not isinstance(outputs, mp.queues.Queue): - raise ClusterError('jobqueue and outputs must be multiprocessing ' + - 'Queue objects') - - # Initialize job objects - jobno = int(jobno) if jobno \ - else int(conf.get_option('jobqueue', 'jobno', str(1))) - jobs = {} # This will hold job numbers - started = [] # A list of started jobs to check against - cores = cores if cores else THREADS - queue = [] # This will hold Processes that haven't started yet - running = [] # This will hold actively running jobs to manage core count - done = [] # A list of completed jobs to check against - - # Actually loop through the jobs - while True: - if not jobqueue.empty(): - oldjob = jobno - jobno += 1 - newjob = jobno - # Sometimes the thread stalls if it has been left a while and - # ends up reusing the same job number. I don't know why this - # happens, however explicitly getting the jobno seems to fix the - # issue. Just to be sure I also want to test that the job number is - # incremented, but this is a little redundant at this point. - assert newjob == oldjob + 1 - job = jobqueue.get_nowait() - if not isinstance(job, Job): - logme.log('job information must be a job object, was {}'.format( - type(job)), 'error') - continue - - # The arguments look good, so lets add this to the stack. - job.state = 'submitted' - job.id = jobno - jobs[jobno] = job - - # Send the job dictionary - output(jobs) - - # If there are jobs, try and run them - if jobs: - for jobno, job_info in jobs.items(): - # Skip completed jobs - if job_info.state == 'done': - continue - - # Check dependencies - ready = True - if job_info.depends: - for depend in job_info.depends: - if int(depend) not in done: - ready = False - job_info.state = 'waiting' - output(jobs) - - # Start jobs if dependencies are met and they aren't started. - # We use daemon mode so that child jobs are killed on exit. - if ready and not jobno in started: - ver = sys.version_info.major - # Python 2 doesn't support daemon, even though the docs - # say that it does. 
-                    gen_args = dict(name=str(jobno)) if ver == 2 \
-                        else dict(name=str(jobno), daemon=True)
-                    if job_info.args and job_info.kwargs:
-                        queue.append((mp.Process(target=job_info.function,
-                                                 args=job_info.args,
-                                                 kwargs=job_info.kwargs,
-                                                 **gen_args),
-                                      job_info.cores))
-                    elif job_info.args:
-                        queue.append((mp.Process(target=job_info.function,
-                                                 args=job_info.args,
-                                                 **gen_args),
-                                      job_info.cores))
-                    elif job_info.kwargs:
-                        queue.append((mp.Process(target=job_info.function,
-                                                 kwargs=job_info.kwargs,
-                                                 **gen_args),
-                                      job_info.cores))
-                    else:
-                        queue.append((mp.Process(target=job_info.function,
-                                                 **gen_args),
-                                      job_info.cores))
-                    job_info.state = 'queued'
-                    started.append(jobno)
-                    output(jobs)
-
-        # Actually run jobs
-        if queue:
-            # Get currently used cores, ignore ourself
-            running_cores = 0
-            for i in [i[1] for i in running]:
-                running_cores += i
-            # Look for a job to run
-            for j in queue:
-                if j[1] + running_cores <= cores:
-                    j[0].start()
-                    jobs[int(j[0].name)].state = 'running'
-                    jobs[int(j[0].name)].pid = j[0].pid
-                    running.append(queue.pop(queue.index(j)))
-                    output(jobs)
-                    sleep(0.5)  # Wait for a second to allow job to start
-                    break
-
-        # Clean out running jobs
-        if running:
-            for i in running:
-                j = i[0]
-                if not j.is_alive():
-                    jobs[int(j.name)].out = j.join()
-                    jobs[int(j.name)].state = 'done'
-                    jobs[int(j.name)].exitcode = j.exitcode
-                    done.append(int(j.name))
-                    running.pop(running.index(i))
-                    output(jobs)
-
-        # Wait for half a second before looping again
-        sleep(0.5)
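That is the end of the multiprocessing-based local queue; from here on, queue
monitoring routes through the pluggable ``fyrd.batch_systems`` package
instead. The reworked ``Queue`` API introduced in the next file can be
exercised like this (the jobs themselves are illustrative)::

    import fyrd

    q = fyrd.queue.Queue(user='self')
    jobs = [fyrd.Job('echo hi').submit() for _ in range(3)]
    q.wait(jobs)           # block until every job leaves the queue
    results = q.get(jobs)  # outputs, in the same order as the input list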
diff --git a/fyrd/queue.py b/fyrd/queue.py
index f489a8b..7fc6864 100644
--- a/fyrd/queue.py
+++ b/fyrd/queue.py
@@ -1,8 +1,8 @@
-#job -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 """
 Monitor the queue for torque or slurm.
 
-Provides a class to monitor the torque, slurm, or local queues with identical
+Provides a class to monitor all defined batch queues with identical
 syntax.
 
 At its simplest, you can use it like::
@@ -25,59 +25,28 @@ The default cluster environment is also defined in this file as MODE, it can
 be set directly or with the get_cluster_environment() function defined here.
 """
-import re
-import sys
-import pwd      # Used to get usernames for queue
-import socket   # Used to get the hostname
-import getpass  # Used to get usernames for queue
+import pwd as _pwd          # Used to get usernames for queue
+import getpass as _getpass  # Used to get usernames for queue
 from datetime import datetime as _dt
-from time import time, sleep
-from subprocess import check_output, CalledProcessError
-
-# For parsing torque queues
-import xml.etree.ElementTree as ET
+from time import time as _time
+from time import sleep as _sleep
 
 ###############################################################################
 #                               Our functions                                 #
 ###############################################################################
 
-from . import run
-from . import logme
-from . import conf
-from . import ClusterError
-
-#########################
-#  Which system to use  #
-#########################
-
-from . import ALLOWED_MODES
-
-#########################################################
-#  The multiprocessing pool, only used in 'local' mode  #
-#########################################################
-
-from . import local
+from . import run as _run
+from . import logme as _logme
+from . import conf as _conf
+from . import ClusterError as _ClusterError
+from . import batch_systems as _batch
 
 # Functions to import if requested
-__all__ = ['Queue', 'wait', 'check_queue', 'get_cluster_environment']
+__all__ = ['Queue']
 
 # We only need the queue defaults
-_defaults = conf.get_option('queue')
-
-# This is set in the get_cluster_environment() function.
-MODE = ''
-
-# Define torque-to-slurm mappings
-TORQUE_SLURM_STATES = {
-    'C': 'completed',
-    'E': 'completing',
-    'H': 'held',  # Not a SLURM state
-    'Q': 'pending',
-    'R': 'running',
-    'T': 'suspended',
-    'W': 'running',
-    'S': 'suspended',
-}
+_defaults = _conf.get_option('queue')
+
 
 # Define job states
 GOOD_STATES = ['complete', 'completed', 'special_exit']
@@ -97,7 +66,7 @@ class Queue(object):
 
-    """A wrapper for torque, slurm, or local queues.
+    """A wrapper for all defined batch systems.
 
     Attributes:
         jobs (dict)    A dictionary of all jobs in this queue
@@ -118,14 +87,14 @@ def __init__(self, user=None, partition=None, qtype=None,):
                 If user='self' or 'current', the current user will be used.
             partition (str): Optional partition to filter the queue with.
-            qtype (str): 'torque', 'slurm', or 'local', defaults to auto-detect.
+            qtype (str): one of the defined batch queues (e.g. 'slurm')
         """
         # Get user ID as an int UID
         if user:
             if user == 'self' or user == 'current':
-                self.user = getpass.getuser()
+                self.user = _getpass.getuser()
                 """The username if defined."""
-                self.uid = pwd.getpwnam(self.user).pw_uid
+                self.uid = _pwd.getpwnam(self.user).pw_uid
             elif user == 'ALL':
                 self.user = None
             else:
@@ -133,33 +102,32 @@ def __init__(self, user=None, partition=None, qtype=None,):
                    or (isinstance(user, str) and user.isdigit()):
                     self.uid = int(user)
                 else:
-                    self.uid = pwd.getpwnam(str(user)).pw_uid
+                    self.uid = _pwd.getpwnam(str(user)).pw_uid
         else:
             self.uid = None
-        self.user = pwd.getpwuid(self.uid).pw_name if self.uid else None
+        self.user = _pwd.getpwuid(self.uid).pw_name if self.uid else None
         self.partition = partition
         """The partition if defined."""
 
         # Set queue length
-        self.max_jobs = conf.get_option('queue', 'max_jobs')
+        self.max_jobs = _conf.get_option('queue', 'max_jobs')
         """The maximum number of jobs that can run in this queue."""
 
         # Support python2, which hates reciprocal import
         from .job import Job
-        from .local import Job as QJob
         self._Job = Job
-        self._JobQueue = QJob
 
         # Get sleep time and update time
-        self.queue_update_time = conf.get_option('queue', 'queue_update', 2)
-        self.sleep_len = conf.get_option('queue', 'sleep_len', 2)
+        self.queue_update_time = _conf.get_option('queue', 'queue_update', 2)
+        self.sleep_len = _conf.get_option('queue', 'sleep_len', 2)
 
         # Set type
         if qtype:
-            check_queue(qtype)
+            _batch.check_queue(qtype)
         else:
-            check_queue()
-        self.qtype = qtype if qtype else MODE
+            _batch.check_queue()
+        self.qtype = qtype if qtype else _batch.MODE
+        self.batch_system = _batch.get_batch_system(self.qtype)
 
         # Allow tracking of updates to prevent too many updates
         self._updating = False
@@ -185,8 +153,8 @@ def check_dependencies(self, dependencies):
                 completed, 'bad' if failed, cancelled, or suspended,
                 'absent' otherwise.
         """
-        for dep in run.listify(dependencies):
-            dep = int(dep)
+        for dep in _run.listify(dependencies):
+            dep = str(dep)
             if dep not in self.jobs:
                 return 'absent'
             state = self.jobs[dep].state
@@ -217,122 +185,179 @@ def wait(self, jobs):
                 True on success, False or None on failure. 
""" self.update() - logme.log('Queue waiting.', 'debug') + _logme.log('Queue waiting.', 'debug') # Sanitize arguments + jobs = _run.listify(jobs) if not isinstance(jobs, (list, tuple)): jobs = [jobs] for job in jobs: - if not isinstance(job, (str, int, self.QueueJob, self._Job, - self._JobQueue)): - raise ClusterError('job must be int, string, or Job, ' + - 'is {}'.format(type(job))) + if not isinstance(job, (str, int, QueueJob, self._Job)): + raise _ClusterError('job must be int, string, or Job, ' + + 'is {}'.format(type(job))) # Wait for 0.1 second before checking, as jobs take a while to be # queued sometimes - sleep(0.1) + _sleep(0.1) + pbar = _run.get_pbar(jobs, name="Waiting for job completion", + unit='jobs') for job in jobs: - logme.log('Checking {}'.format(job), 'debug') - qtype = job.qtype if isinstance(job, self._Job) else self.qtype - if isinstance(job, (self._Job, self._JobQueue, self.QueueJob)): + _logme.log('Checking {}'.format(job), 'debug') + if isinstance(job, (self._Job, QueueJob)): job = job.id - if qtype == 'local': - logme.log('Job is in local queue', 'debug') - try: - job = int(job) - except TypeError: - raise TypeError('Job must be a Job object or job #.') - if not local.JQUEUE \ - or not local.JQUEUE.runner.is_alive(): - raise ClusterError('Cannot wait on job ' + str(job) + - 'JobQueue does not exist') - local.JQUEUE.wait(job) - else: - logme.log('Job is in remote queue', 'debug') - if isinstance(job, self._Job): - job = job.id - job = int(job) - not_found = 0 - lgd = False - while True: - self._update() - # Allow 12 seconds to elapse before job is found in queue, - # if it is not in the queue by then, assume completion. - if job not in self.jobs: - if lgd: - logme.log('Attempt #{}/12'.format(not_found), - 'debug') - else: - logme.log('{} not in queue, waiting up to 12s ' - .format(job) + - 'for it to appear', 'info') - lgd = True - sleep(1) - not_found += 1 - if not_found == 12: - logme.log( - '{} not in queue, tried 12 times over 12s' - .format(job) + '. Job likely completed, ' + - 'assuming completion, stats will be ' + - 'unavailable.','warn' - ) - return 'disappeared' - continue - ## Actually look for job in running/queued queues - lgd = False - lgd2 = False - start = _dt.now() - res_time = conf.get_option('queue', 'res_time') - count = 0 - # Get job state - job_state = self.jobs[job].state - # Check the state - if job_state in GOOD_STATES: - logme.log('Queue wait for {} complete' - .format(job), 'debug') - sleep(0.1) - break - elif job_state in ACTIVE_STATES: - if lgd: - logme.log('{} not complete yet, waiting' - .format(job), 'debug') - lgd = True - else: - logme.log('{} still not complete, waiting' - .format(job), 'verbose') - sleep(self.sleep_len) - elif job_state in BAD_STATES: - logme.log('Job {} failed with state {}' - .format(job, job_state), 'error') - return False - elif job_state in UNCERTAIN_STATES: - if not lgd2: - logme.log('Job {} in state {}, waiting {} ' - .format(job, job_state, res_time) + - 'seconds for resolution', 'warn') - lgd2 = True - if (_dt.now() - start).seconds > res_time: - logme.log('Job {} still in state {}, aborting' - .format(job, job_state), 'error') - return False - sleep(self.sleep_len) + if isinstance(job, self._Job): + job = job.id + job, _ = self.batch_system.normalize_job_id(job) + lgd = False + while True: + self._update() + # Allow 12 seconds to elapse before job is found in queue, + # if it is not in the queue by then, assume completion. 
+ if not self.test_job_in_queue(job): + break + ## Actually look for job in running/queued queues + lgd = False + lgd2 = False + start = _dt.now() + res_time = _conf.get_option('queue', 'res_time') + count = 0 + # Get job state + job_state = self.jobs[job].state + # Check the state + if job_state in GOOD_STATES: + _logme.log('Queue wait for {} complete' + .format(job), 'debug') + _sleep(0.1) + pbar.update() + break + elif job_state in ACTIVE_STATES: + if lgd: + _logme.log('{} not complete yet, waiting' + .format(job), 'debug') + lgd = True else: - if count == 5: - logme.log('Job {} in unknown state {} ' - .format(job, job_state) + - 'cannot continue', 'critical') - raise QueueError('Unknown job state {}' - .format(job_state)) - logme.log('Job {} in unknown state {} ' - .format(job, job_state) + - 'trying to resolve', 'debug') - count += 1 - sleep(self.sleep_len) + _logme.log('{} still not complete, waiting' + .format(job), 'verbose') + _sleep(self.sleep_len) + elif job_state in BAD_STATES: + _logme.log('Job {} failed with state {}' + .format(job, job_state), 'error') + return False + elif job_state in UNCERTAIN_STATES: + if not lgd2: + _logme.log('Job {} in state {}, waiting {} ' + .format(job, job_state, res_time) + + 'seconds for resolution', 'warn') + lgd2 = True + if (_dt.now() - start).seconds > res_time: + _logme.log('Job {} still in state {}, aborting' + .format(job, job_state), 'error') + return False + _sleep(self.sleep_len) + else: + if count == 5: + _logme.log('Job {} in unknown state {} ' + .format(job, job_state) + + 'cannot continue', 'critical') + raise QueueError('Unknown job state {}' + .format(job_state)) + _logme.log('Job {} in unknown state {} ' + .format(job, job_state) + + 'trying to resolve', 'debug') + count += 1 + _sleep(self.sleep_len) # Sleep an extra half second to allow post-run scripts to run - sleep(0.5) + _sleep(0.5) return True + def get(self, jobs): + """Get all results from a bunch of Job objects. + + Attributes: + jobs (list): List of fyrd.Job objects + + Returns: + job_results (dict): {job_id: Job} + + Raises: + fyrd.ClusterError if any job fails or goes missing. + """ + self.update() + _logme.log('Queue waiting.', 'debug') + singular = True if isinstance(jobs, self._Job) else False + + # Force into enumerated list to preserve order + jobs = dict(enumerate(_run.listify(jobs))) + done = {} + + # Check that all jobs are valid + for job in jobs.values(): + if not isinstance(job, self._Job): + raise _ClusterError('This only works with cluster job ' + 'objects') + + # Loop through all jobs continuously trying to get outputs + pbar = _run.get_pbar(jobs, name="Getting Job Results", unit='jobs') + while jobs: + for i in list(jobs): + job = jobs[i] + job.update() + if not self.test_job_in_queue(job.id): + raise _ClusterError('Job {} not queued'.format(job.id)) + if job.state == 'completed': + done[i] = jobs.pop(i).get() + pbar.update() + elif job.state in BAD_STATES: + pbar.close() + raise _ClusterError('Job {} failed, cannot get output' + .format(job.id)) + # Block between attempts + _sleep(self.sleep_len) + pbar.write('Done\n') + pbar.close() + + # Correct the order, make it the same as the input list + results = [] + for i in sorted(done.keys()): + results.append(done[i]) + return results[0] if singular else results + + def test_job_in_queue(self, job): + """Check to make sure job is in self. + + Tries 12 times with 1 second between each. If found returns True, + else False. 
+        """
+        lgd = False
+        not_found = 0
+        while True:
+            self._update()
+            # Allow 12 seconds to elapse before job is found in queue,
+            # if it is not in the queue by then, assume completion.
+            if job in self.jobs:
+                return True
+            else:
+                if lgd:
+                    _logme.log('Attempt #{}/12'.format(not_found),
+                               'debug')
+                else:
+                    _logme.log('{} not in queue, waiting up to 12s '
+                               .format(job) +
+                               'for it to appear', 'info')
+                    lgd = True
+                _sleep(1)
+                not_found += 1
+                if not_found == 12:
+                    _logme.log(
+                        '{} not in queue, tried 12 times over 12s'
+                        .format(job) + '. Job likely completed, ' +
+                        'assuming completion, stats will be ' +
+                        'unavailable.', 'warn'
+                    )
+                    return False
+                continue
+
     def wait_to_submit(self, max_jobs=None):
         """Block until fewer running/queued jobs in queue than max_jobs.
 
@@ -347,32 +372,33 @@
             if self.can_submit:
                 return
             if not written:
-                logme.log(('The queue is full, there are {} jobs running and '
-                           '{} jobs queued. Will wait to submit, retrying '
-                           'every {} seconds.')
-                          .format(len(self.running), len(self.queued),
-                                  self.sleep_len),
-                          'info')
+                _logme.log(('The queue is full, there are {} jobs running and '
+                            '{} jobs queued. Will wait to submit, retrying '
+                            'every {} seconds.')
+                           .format(len(self.running), len(self.queued),
+                                   self.sleep_len),
+                           'info')
                 written = True
             if count == 0:
-                logme.log('Still waiting to submit.', 'info')
+                _logme.log('Still waiting to submit.', 'info')
                 count = 50
             count -= 1
-            sleep(self.sleep_len)
+            _sleep(self.sleep_len)
 
     def update(self):
         """Refresh the list of jobs from the server, limit queries."""
-        if int(time()) - self.last_update > self.queue_update_time:
+        if int(_time()) - self.last_update > self.queue_update_time:
             self._update()
         else:
-            logme.log('Skipping update as last update too recent', 'debug')
+            _logme.log('Skipping update as last update too recent', 'debug')
         return self
 
     def get_jobs(self, key):
         """Return a dict of jobs where state matches key."""
         retjobs = {}
+        keys = [k.lower() for k in _run.listify(key)]
         for jobid, job in self.jobs.items():
-            if job.state == key.lower():
+            if job.get_state() in keys:
                 retjobs[jobid] = job
         return retjobs
 
@@ -397,8 +423,8 @@
 
     @property
     def users(self):
-        """Return a list of users with jobs running."""
-        return [job.owner for job in self.jobs.values()]
+        """Return a set of users with jobs running."""
+        return set([job.owner for job in self.jobs.values()])
 
     @property
     def job_states(self):
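``get_jobs()`` now accepts either a single state or a list of states, which
the count property in the next hunk relies on. A quick sketch of the new
calling convention (the states shown are illustrative)::

    q = fyrd.queue.Queue(user='self')
    active = q.get_jobs(['running', 'pending'])  # {job_id: QueueJob}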
@@ -419,12 +445,11 @@
 
     @property
     def bad(self):
 
     @property
     def active_job_count(self):
-        """Return a count of all queued or running jobs."""
-        return sum([
-            len(j) for j in [
-                self.get_jobs(i) for i in ACTIVE_STATES
-            ]
-        ])
+        """Return a count of all queued or running jobs, including array jobs."""
+        jobcount = 0
+        for j in self.get_jobs(ACTIVE_STATES).values():
+            jobcount += j.jobcount()
+        return jobcount
 
     @property
     def can_submit(self):
@@ -446,87 +471,71 @@ def _update(self):
         """
         if self._updating:
             return
-        logme.log('Queue updating', 'debug')
+        _logme.log('Queue updating', 'debug')
         # Set the update time I don't care about microseconds
-        self.last_update = int(time())
+        self.last_update = int(_time())
 
         jobs = []  # list of jobs created this session
+        for [job_id, array_id, job_name, job_user, job_partition,
+             job_state, job_nodelist, job_nodecount,
+             job_cpus, job_exitcode] in self.batch_system.queue_parser(
+                 self.user, self.partition):
+            if job_nodecount and job_cpus:
+                job_threads = int(job_nodecount) * int(job_cpus)
+            else:
+                job_threads = None
+            if job_state not in ('completed', 'failed'):
+                job_exitcode = None
 
-        # Mode specific initialization
-        if self.qtype == 'local':
-            if not local.JQUEUE or not local.JQUEUE.runner.is_alive():
-                local.JQUEUE = local.JobQueue(cores=local.THREADS)
-            for job_id, job_info in local.JQUEUE:
-                if job_id in self.jobs:
-                    job = self.jobs[job_id]
-                else:
-                    job = self.QueueJob()
-                job.id = job_id
-                job.name = job_info.function.__name__
-                job.owner = self.user
-                self.nodes = socket.gethostname()
-                if job_info.state == 'Not Submitted':
-                    job.state = 'pending'
-                elif job_info.state == 'waiting' \
-                        or job_info.state == 'submitted':
-                    job.state = 'pending'
-                elif job_info.state == 'started' \
-                        or job_info.state == 'running':
-                    job.state = 'running'
-                elif job_info.state == 'done':
-                    job.state = 'completed'
-                    job.exitcode = int(job_info.exitcode)
-                elif job_info.state == 'queued':
-                    job.state = 'pending'
-                else:
-                    raise Exception('Unrecognized state')
-
-                # Assign the job to self.
-                self.jobs[job_id] = job
-                jobs.append(job_id)
-
-        else:
-            for [job_id, job_name, job_user, job_partition,
-                 job_state, job_nodelist, job_nodecount,
-                 job_cpus, job_exitcode] in queue_parser(self.qtype,
-                                                         self.user,
-                                                         self.partition):
-                if job_id not in self.jobs:
-                    job = self.QueueJob()
-                else:
-                    job = self.jobs[job_id]
-                jobs.append(job_id)
-                job.id = job_id
-                job.name = job_name
-                job.owner = job_user
-                job.queue = job_partition
-                job.state = job_state.lower()
-                job.nodes = job_nodelist
-
-                # Threads is number of nodes * jobs per node
-                if job_nodecount and job_cpus:
-                    job.threads = int(job_nodecount) * int(job_cpus)
-                else:
-                    job.threads = None
-                if job.state == 'completed' or job.state == 'failed':
-                    job.exitcode = job_exitcode
+            # Get/Create the QueueJob object
+            if job_id not in self.jobs:
+                job = QueueJob()
+            else:
+                job = self.jobs[job_id]
+
+            job.id = job_id
+            job.name = job_name
+            job.owner = job_user
+            job.queue = job_partition
+            job.state = job_state.lower()
+
+            if array_id is not None:
+                job.array_job = True
+                cjob = QueueChild(job)
+                cjob.id = array_id
+                cjob.name = job_name
+                cjob.owner = job_user
+                cjob.queue = job_partition
+                cjob.state = job_state.lower()
+                cjob.nodes = job_nodelist
+                cjob.threads = job_threads
+                cjob.exitcode = job_exitcode
+                job.children[array_id] = cjob
+                job.state = job.get_state()
+                job.nodes = job.get_nodelist()
+                job.threads = job.get_threads()
+                job.exitcode = job.get_exitcode()
+            else:
+                job.nodes = job_nodelist
+                job.threads = job_threads
+                job.exitcode = job_exitcode
 
-            # Assign the job to self. 
+ self.jobs[job_id] = job + jobs.append(str(job_id)) # We assume that if a job just disappeared it completed if self.jobs: for qjob in self.jobs.values(): - if qjob.id not in jobs: + if str(qjob.id) not in jobs: qjob.state = 'completed' qjob.disappeared = True def __getattr__(self, key): """Make running and queued attributes dynamic.""" - key = key.lower() - if key in TORQUE_SLURM_STATES: - key = TORQUE_SLURM_STATES[key] + key = self.batch_system.normalize_state(key.lower()) if key == 'complete': key = 'completed' elif key == 'queued': @@ -537,8 +546,8 @@ def __getattr__(self, key): def __getitem__(self, key): """Allow direct accessing of jobs by job id.""" if isinstance(key, self._Job): - key = key.jobid - key = int(key) + key = key.id + key = str(key) try: return self.jobs[key] except KeyError: @@ -569,491 +578,202 @@ def __str__(self): """A list of keys.""" return str(self.jobs.keys()) - ############################################## - # A simple class to hold jobs in the queue # - ############################################## - - class QueueJob(object): - - """A very simple class to store info about jobs in the queue. - - Only used for torque and slurm queues. - - Attributes: - id (int): Job ID - name (str): Job name - owner (str): User who owns the job - threads (int): Number of cores used by the job - queue (str): The queue/partition the job is running in - state (str): Current state of the job, normalized to slurm - states - nodes (list): List of nodes job is running on - exitcode (int): Exit code of completed job - disappeared (bool): Job cannot be found in the queue anymore - """ - - id = None - name = None - owner = None - threads = None - queue = None - state = None - nodes = None - exitcode = None - disappeared = False - - def __init__(self): - """No initialization needed all attributes are set elsewhere.""" - pass - - def __repr__(self): - """Show all info.""" - outstr = ("Queue.QueueJob<{id}:{state}({name},owner:{owner}," + - "queue:{queue},nodes:{nodes},threads:{threads}," + - "exitcode:{code})").format( - id=self.id, name=self.name, owner=self.owner, - queue=self.queue, nodes=self.nodes, - code=self.exitcode, threads=self.threads, - state=self.state) - if self.disappeared: - outstr += 'DISAPPEARED>' - else: - outstr += '>' - return outstr - - def __str__(self): - """Print job ID.""" - return str(self.id) - -################ -# Exceptions # -################ - -class QueueError(Exception): - - """Simple Exception wrapper.""" - - pass +############################################## +# A simple class to hold jobs in the queue # +############################################## -############################################################################### -# Non-Class Functions # -############################################################################### +class _QueueJob(object): -################### -# Queue Parsers # -################### + """A very simple class to store info about jobs in the queue. -def queue_parser(qtype=None, user=None, partition=None): - """Call either torque or slurm qtype parsers depending on qtype. 
-
-    Args:
-        qtype: Either 'torque' or 'slurm', defaults to current MODE
-        user: optional user name to pass to queue to filter queue with
-
-    Yields:
-        tuple: job_id, name, userid, partition, state, nodelist, numnodes,
-               ntpernode, exit_code
+    Attributes:
+        id (int):           Job ID
+        name (str):         Job name
+        children (dict):    If an array job, dict of child jobs keyed by
+                            array ID
+        owner (str):        User who owns the job
+        threads (int):      Number of cores used by the job
+        queue (str):        The queue/partition the job is running in
+        state (str):        Current state of the job, normalized to slurm
+                            states
+        nodes (list):       List of nodes job is running on
+        exitcode (int):     Exit code of completed job
+        disappeared (bool): Job cannot be found in the queue anymore
     """
-    if not qtype:
-        qtype = get_cluster_environment()
-    if qtype == 'torque':
-        return torque_queue_parser(user, partition)
-    elif qtype == 'slurm':
-        return slurm_queue_parser(user, partition)
-    else:
-        raise ClusterError("Invalid qtype type {}, must be 'torque' or 'slurm'"
-                           .format(qtype))
 
+    id = None
+    name = None
+    owner = None
+    threads = None
+    queue = None
+    state = None
+    nodes = None
+    exitcode = None
+    disappeared = False
+    array_job = False
+    children = {}
+    parent = None
+    _child_job = False
+    _cname = None
+
+    def get_state(self):
+        """Return the current state of the job."""
+        if self.array_job:
+            pending_jobs = False
+            running_jobs = False
+            failed_jobs = False
+            for job_info in self.children.values():
+                if job_info.state == 'pending':
+                    pending_jobs = True
+                elif job_info.state in ACTIVE_STATES:
+                    running_jobs = True
+                elif job_info.state in BAD_STATES:
+                    failed_jobs = True
+            if running_jobs:
+                return 'running'
+            if pending_jobs:
+                return 'pending'
+            if failed_jobs:
+                return 'failed'
+            return 'completed'
+        return self.state
+
+    def get_nodelist(self):
+        """Return the nodelist of the job."""
+        if self.array_job:
+            nodelist = []
+            for job_info in self.children.values():
+                if job_info.nodes:
+                    nodelist = nodelist + job_info.nodes
+            return nodelist if nodelist else None
+        return self.nodes
+
+    def get_threads(self, state=None):
+        """Return the number of threads used by this job."""
+        states = [i.lower() for i in _run.listify(state)]
+        if self.array_job:
+            if state:
+                return sum([j.threads for j in self.children.values()
+                            if j.state in states])
+            return len(self.children)
+        if state:
+            return self.threads if self.state in states else 0
+        return self.threads
+
+    def get_exitcode(self):
+        """Return sum of exitcodes for all completed jobs."""
+        if self.array_job:
+            code = 0
+            some_done = False
+            for child in self.children.values():
+                if child.state in DONE_STATES:
+                    some_done = True
+                    code += child.exitcode
+            if some_done:
+                return code
+            return None
+        return self.exitcode
+
+    def jobcount(self, state=None):
+        """Return a count of how many running jobs we have."""
+        states = [i.lower() for i in _run.listify(state)]
+        if self.array_job:
+            if state:
+                return len([j for j in self.children.values()
+                            if j.state in states])
+            return len(self.children)
+        if state:
+            return 1 if self.state in states else 0
+        return 1
 
-def torque_queue_parser(user=None, partition=None):
-    """Iterator for torque queues.
-
-    Use the `qstat -x` command to get an XML queue for compatibility.
-
-    Args:
-        user:     optional user name to pass to qstat to filter queue with
-        partiton: optional partition to filter the queue with
-
-    Yields:
-        tuple: job_id, name, userid, partition, state, nodelist, numnodes,
-               ntpernode, exit_code
-
-    numcpus is currently always 1 as most torque queues treat every core as a
-    node. 
- """ - # I am not using run.cmd because I want to catch XML errors also - try_count = 0 - qargs = ['qstat', '-x'] - while True: - try: - xmlqueue = ET.fromstring(check_output(qargs)) - except CalledProcessError: - sleep(1) - if try_count == 5: - raise - else: - try_count += 1 - except ET.ParseError: - # ElementTree throws error when string is empty - sleep(1) - if try_count == 1: - xmlqueue = None - break + def __repr__(self): + """Show all info.""" + if not self._child_job: + if self.array_job: + children = len(self.children) if self.children else None + child_str = ':children:{0}'.format(children) else: - try_count += 1 + child_str = '' else: - break - - # Create QueueJob objects for all entries that match user - if xmlqueue is not None: - for xmljob in xmlqueue: - job_id = xmljob.find('Job_Id').text.split('.')[0] - if '[' in job_id: - job_id, array_id = job_id.split('[') - array_id = array_id.strip('[]') - if array_id: - array_id = int(array_id) - else: - array_id = 0 - else: - array_id = None - try: - job_id = int(job_id) - except ValueError: - # Allow string job IDs - pass - - job_owner = xmljob.find('Job_Owner').text.split('@')[0] - if user and job_owner != user: - continue - job_name = xmljob.find('Job_Name').text - job_queue = xmljob.find('queue').text - job_state = xmljob.find('job_state').text - job_state = TORQUE_SLURM_STATES[job_state] - logme.log('Job {} state: {}'.format(job_id, job_state), - 'debug') - ndsx = xmljob.find('exec_host') - if ndsx: - nds = ndsx.text.split('+') - else: - nds = [] - nodes = [] - for node in nds: - if '-' in node: - nm, num = node.split('/') - for i in range(*[int(i) for i in num.split('-')]): - nodes.append(nm + '/' + str(i).zfill(2)) - else: - nodes.append(node) - # I assume that every 'node' is a core, as that is the - # default for torque, but it isn't always true - job_threads = len(nodes) - exitcode = xmljob.find('exit_status') - if hasattr(exitcode, 'text'): - exitcode = int(exitcode.text) - - if partition and job_queue != partition: - continue - yield (job_id, job_name, job_owner, job_queue, job_state, - nodes, job_threads, 1, exitcode) + child_str = ':parent:{0}'.format(self.parent.id) + outstr = ("{cname}<{id}:{state}{child}" + + "({name},owner:{owner}," + + "queue:{queue},nodes:{nodes},threads:{threads}," + + "exitcode:{code})").format( + id=self.id, name=self.name, owner=self.owner, + queue=self.queue, nodes=self.nodes, + code=self.exitcode, threads=self.threads, + state=self.state, child=child_str, cname=self._cname + ) + if self.disappeared: + outstr += 'DISAPPEARED>' + else: + outstr += '>' + return outstr + def __str__(self): + """Print job ID.""" + return str(self.id) -def slurm_queue_parser(user=None, partition=None): - """Iterator for slurm queues. - Use the `squeue -O` command to get standard data across implementation, - supplement this data with the results of `sacct`. sacct returns data only - for the current user but retains a much longer job history. Only jobs not - returned by squeue are added with sacct, and they are added to *the end* of - the returned queue, i.e. *out of order with respect to the actual queue*. +class QueueJob(_QueueJob): - Args: - user: optional user name to filter queue with - partition: optional partition to filter queue with + """A very simple class to store info about jobs in the queue. 
- Yields: - tuple: job_id, name, userid, partition, state, nodelist, numnodes, - ntpernode, exit_code - """ - nodequery = re.compile(r'([^\[,]+)(\[[^\[]+\])?') - qargs = ['squeue', '-h', '-O', - 'jobid:400,arraytaskid:400,name:400,userid:400,partition:400,' + - 'state:400,nodelist:400,numnodes:400,numcpus:400,exit_code:400'] - # Parse queue info by length - squeue = [ - tuple( - [k[i:i+200].rstrip() for i in range(0, 4000, 400)] - ) for k in run.cmd(qargs)[1].split('\n') - ] - # SLURM sometimes clears the queue extremely fast, so we use sacct - # to get old jobs by the current user - qargs = ['sacct', '-p', - '--format=jobid,jobname,user,partition,state,' + - 'nodelist,reqnodes,ncpus,exitcode'] - try: - sacct = [tuple(i.strip(' |').split('|')) for i in - run.cmd(qargs)[1].split('\n')] - sacct = sacct[1:] - # This command isn't super stable and we don't care that much, so I will - # just let it die no matter what - except Exception as e: - if logme.MIN_LEVEL == 'debug': - raise e - else: - sacct = [] - - if sacct: - if len(sacct[0]) != 9: - logme.log('sacct parsing failed unexpectedly as there are not ' + - '9 columns, aborting.', 'critical') - raise ValueError('sacct output does not have 9 columns. Has:' + - '{}: {}'.format(len(sacct[0]), sacct[0])) - jobids = [i[0] for i in squeue] - for sinfo in sacct: - # Skip job steps, only index whole jobs - if '.' in sinfo[0]: - logme.log('Skipping {} '.format(sinfo[0]) + - "in sacct processing as it is a job part.", - 'verbose') - continue - # These are the values I expect - try: - [sid, sname, suser, spartition, sstate, - snodelist, snodes, scpus, scode] = sinfo - if '_' in sid: - sid, sarr = sid.split('_') - sif = '{}_{}'.format(sid, sarr) - else: - sarr = None - sif = '{}'.format(sid) - except ValueError as err: - logme.log('sacct parsing failed with error {} '.format(err) + - 'due to an incorrect number of entries.\n' + - 'Contents of sinfo:\n{}\n'.format(sinfo) + - 'Expected 9 values\n:' + - '[sid, sname, suser, spartition, sstate, ' + - 'snodelist, snodes, scpus, scode]', - 'critical') - raise - # Skip jobs that were already in squeue - if sid in jobids: - logme.log('{} still in squeue output'.format(sid), - 'verbose') - continue - scode = int(scode.split(':')[-1]) - squeue.append((sid, sarr, sname, suser, spartition, sstate, - snodelist, snodes, scpus, scode)) - else: - logme.log('No job info in sacct', 'debug') - - # Sanitize data - for sinfo in squeue: - if len(sinfo) == 10: - [sid, sarr, sname, suser, spartition, sstate, sndlst, - snodes, scpus, scode] = sinfo - else: - sys.stderr.write('{}'.format(repr(sinfo))) - raise ClusterError('Queue parsing error, expected 10 items ' - 'in output of squeue and sacct, got {}\n' - .format(len(sinfo))) - if partition and spartition != partition: - continue - if not isinstance(sid, int): - sid = int(sid) if sid else None - if isinstance(sarr, str) and sarr.isdigit(): - sarr = int(sarr) - else: - sarr = None - if not isinstance(snodes, int): - snodes = int(snodes) if snodes else None - if not isinstance(scpus, int): - scpus = int(scpus) if snodes else None - if not isinstance(scode, int): - scode = int(scode) if scode else None - # Convert user from ID to name - if suser.isdigit(): - suser = pwd.getpwuid(int(suser)).pw_name - if user and suser != user: - continue - # Attempt to parse nodelist - snodelist = [] - if sndlst: - if nodequery.search(sndlst): - nsplit = nodequery.findall(sndlst) - for nrg in nsplit: - node, rge = nrg - if not rge: - snodelist.append(node) - else: - for reg in 
rge.strip('[]').split(','): - # Node range - if '-' in reg: - start, end = [int(i) for i in reg.split('-')] - for i in range(start, end): - snodelist.append('{}{}'.format(node, i)) - else: - snodelist.append('{}{}'.format(node, reg)) - else: - snodelist = sndlst.split(',') + Only used for torque and slurm queues. - yield (sid, sname, suser, spartition, sstate, snodelist, - snodes, scpus, scode) + Attributes: + id (int): Job ID + name (str): Job name + owner (str): User who owns the job + threads (int): Number of cores used by the job + queue (str): The queue/partition the job is running in + state (str): Current state of the job, normalized to slurm + states + nodes (list): List of nodes job is running on + exitcode (int): Exit code of completed job + disappeared (bool): Job cannot be found in the queue anymore + array_job (bool): This job is an array job and has children + children (dict): If array job, list of child job numbers + """ + def __init__(self): + """Initialize.""" + self._cname = 'QueueJob' + self.children = {} -########################################################### -# Set the global cluster type: slurm, torque, or local # -########################################################### +class QueueChild(_QueueJob): -def get_cluster_environment(): - """Detect the local cluster environment and set MODE globally. + """A very simple class to store info about child jobs in the queue. - Uses which to search for sbatch first, then qsub. If neither is found, - MODE is set to local. + Only used for torque and slurm queues. - Returns: - tuple: MODE variable ('torque', 'slurm', or 'local') + Attributes: + id (int): Job ID + name (str): Job name + owner (str): User who owns the job + threads (int): Number of cores used by the job + queue (str): The queue/partition the job is running in + state (str): Current state of the job, normalized to slurm + states + nodes (list): List of nodes job is running on + exitcode (int): Exit code of completed job + disappeared (bool): Job cannot be found in the queue anymore + parent (QueueJob): Backref to parent job """ - global MODE - conf_queue = conf.get_option('queue', 'queue_type', 'auto') - if conf_queue not in ['torque', 'slurm', 'local', 'auto']: - logme.log('queue_type in the config file is {}, '.format(conf_queue) + - 'but it should be one of torque, slurm, local, or auto. 
' + - 'Resetting it to auto', 'warn') - conf.set_option('queue', 'queue_type', 'auto') - conf_queue = 'auto' - if conf_queue == 'auto': - sbatch_cmnd = conf.get_option('queue', 'sbatch') - qsub_cmnd = conf.get_option('queue', 'qsub') - sbatch_cmnd = sbatch_cmnd if sbatch_cmnd else 'sbatch' - qsub_cmnd = qsub_cmnd if qsub_cmnd else 'qsub' - if run.which(sbatch_cmnd): - MODE = 'slurm' - elif run.which(qsub_cmnd): - MODE = 'torque' - else: - MODE = 'local' - else: - MODE = conf_queue - if MODE == 'slurm' or MODE == 'torque': - logme.log('{} detected, using for cluster submissions'.format(MODE), - 'debug') - else: - logme.log('No cluster environment detected, using multiprocessing', - 'debug') - return MODE - - -############################## -# Check if queue is usable # -############################## - - -def check_queue(qtype=None): - """Raise exception if MODE is incorrect.""" - if 'MODE' not in globals(): - global MODE - MODE = get_cluster_environment() - if not MODE: - MODE = get_cluster_environment() - if qtype: - if qtype not in ALLOWED_MODES: - raise ClusterError('qtype value {} is not recognized, ' - .format(qtype) + - 'should be: local, torque, or slurm') - else: - if MODE not in ALLOWED_MODES: - MODE = qtype - return True - elif MODE not in ALLOWED_MODES: - raise ClusterError('MODE value {} is not recognized, '.format(MODE) + - 'should be: local, torque, or slurm') - -###################################################################### -# Expose the Queue waiting method without requiring a Queue object # -###################################################################### + def __init__(self, parent): + """Initialize with a parent.""" + self._cname = 'QueueChild' + self.parent = parent -def wait(jobs): - """Wait for jobs to finish. - - Args: - jobs: A single job or list of jobs to wait for. 
With torque or slurm, - these should be job IDs, with local mode, these are - multiprocessing job objects (returned by submit()) - """ - # Support python2, which hates reciprocal import for 80's reasons - from .job import Job - from .local import JobQueue - - check_queue() # Make sure the MODE is usable +################ +# Exceptions # +################ - # Sanitize argument - if not isinstance(jobs, (list, tuple)): - jobs = [jobs] - for job in jobs: - if not isinstance(job, (str, int, Job, JobQueue)): - raise ClusterError('job must be int, string, or Job, ' + - 'is {}'.format(type(job))) +class QueueError(Exception): - if MODE == 'local': - for job in jobs: - try: - job = int(job) - except TypeError: - raise TypeError('Job must be a Job object or job #.') - if not local.JQUEUE or not local.JQUEUE.runner.is_alive(): - raise ClusterError('Cannot wait on job ' + str(job) + - 'JobQueue does not exist') - local.JQUEUE.wait(job) - - elif MODE == 'torque': - # Wait for 1 seconds before checking, as jobs take a while to be queued - # sometimes - sleep(1) - - s = re.compile(r' +') # For splitting qstat output - # Jobs must be strings for comparison operations - jobs = [str(j) for j in jobs] - q = run.cmd('qstat -a', tries=8)[1].rstrip().split('\n') - # Check header - if not re.split(r' {2,100}', q[3])[9] == 'S': - raise ClusterError('Unrecognized torque qstat format') - # Build a list of completed jobs - complete = [] - for j in q[5:]: - i = s.split(j) - if i[9] == 'C': - complete.append(i[0].split('.')[0]) - # Build a list of all jobs - alljobs = [s.split(j)[0].split('.')[0] for j in q[5:]] - # Trim down job list - jobs = [j for j in jobs if j in alljobs] - jobs = [j for j in jobs if j not in complete] - if len(jobs) == 0: - return - elif MODE == 'slurm': - # Wait for 2 seconds before checking, as jobs take a while to be queued - # sometimes - sleep(2) + """Simple Exception wrapper.""" - # Jobs must be strings for comparison operations - jobs = [str(j) for j in jobs] - while True: - # Slurm allows us to get a custom output for faster parsing - q = check_output( - ['squeue', '-h', '-o', "'%A,%t'"]).decode().rstrip().split(',') - # Build a list of jobs - complete = [i[0] for i in q if i[1] == 'CD'] - failed = [i[0] for i in q if i[1] == 'F'] - allj = [i[0] for i in q] - # Trim down job list, ignore failures - jobs = [i for i in jobs if i not in allj] - jobs = [i for i in jobs if i not in complete] - jobs = [i for i in jobs if i not in failed] - if len(jobs) == 0: - return - sleep(2) + pass diff --git a/fyrd/run.py b/fyrd/run.py index 45d3d0e..8892c89 100644 --- a/fyrd/run.py +++ b/fyrd/run.py @@ -8,7 +8,6 @@ These functions are not intended to be accessed directly. """ -from __future__ import print_function from __future__ import with_statement import os as _os import re as _re @@ -22,12 +21,38 @@ from subprocess import PIPE from time import sleep -from six import reraise from six.moves import input as _get_input +# Progress bar handling +from tqdm import tqdm, tqdm_notebook +try: + if str(type(get_ipython())) == "": + _pb = tqdm_notebook + else: + _pb = tqdm +except NameError: + _pb = tqdm + from . import logme as _logme +def get_pbar(iterable, name=None, unit=None, **kwargs): + """Return a tqdm progress bar iterable. + + If progressbar is set to False in the config, will not be shown. + """ + from . 
@@ -61,9 +86,13 @@ def listify(iterable):
         return [iterable]
     if not iterable:
         return []
-    if callable(iterable):
-        iterable = iterable()
-    return list(iter(iterable))
+    # if callable(iterable):
+    #     iterable = iterable()
+    try:
+        iterable = list(iterable)
+    except TypeError:
+        iterable = [iterable]
+    return iterable
 
 
 def merge_lists(lists):
@@ -315,7 +344,6 @@ def cmd(command, args=None, stdout=None, stderr=None, tries=1):
 
 def export_run(function, args, kwargs):
     """Execute a function after first exporting all imports."""
     kwargs['imports'] = export_imports(function, kwargs)
-    print('bob', kwargs['imports'])
     return function(*args, **kwargs)
 
diff --git a/requirements.txt b/requirements.txt
index 2024dc7..14be5de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ dill>=0.2.5
 tabulate>=0.7.7
 six>=1.10.0
 tblib>=1.3.0
+tqdm>=4.15.0
diff --git a/setup.py b/setup.py
index 70b290f..6903028 100755
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ from setuptools.command.test import test as TestCommand
 
 log = setuptools.distutils.log
 
-VERSION='0.6.1b9'
+VERSION='0.6.2a1'
 GITHUB='https://github.com/MikeDacre/fyrd'
 
 ###############################################################################
@@ -95,7 +95,7 @@ def run_tests(self):
         requires=['dill', 'tabulate', 'six', 'tblib'],
         install_requires=['dill', 'tabulate', 'six', 'tblib'],
         tests_require=['pytest'],
-        packages=['fyrd'],
+        packages=['fyrd', 'fyrd/batch_systems'],
         cmdclass={'test': TestRunner},
         scripts=scpts,
         entry_points={
diff --git a/tests/local_queue.py b/tests/local_queue.py
deleted file mode 100644
index 571b931..0000000
--- a/tests/local_queue.py
+++ /dev/null
@@ -1,264 +0,0 @@
-"""Test remote queues, we can't test local queues in py.test."""
-import os
-import sys
-import argparse
-from datetime import datetime as dt
-from datetime import timedelta as td
-sys.path.append(os.path.abspath('.'))
-import fyrd
-
-fyrd.logme.MIN_LEVEL = 'info'
-
-
-###############################################################################
-#                             Support Functions                               #
-###############################################################################
-
-
-def write_to_file(string, file):
-    """Write a string to a file."""
-    with open(file, 'w') as fout:
-        fout.write(string + '\n')
-    return 0
-
-
-def raise_me(number, power=2):
-    """Raise number to power."""
-    return number**power
-
-
-def dosomething(x):
-    """Simple file operation."""
-    out = []
-    with fyrd.run.open_zipped(x) as fin:
-        for line in fin:
-            out.append((line, line.split('\t')[1]*2))
-    return out
-
-
-def dosomethingbad(x):
-    """Try to operate on a file, but do it stupidly."""
-    out = []
-    with open(x) as j:
-        out.append(j, j.split('\t')[1]*2)
-
-
-
-###############################################################################
-#                             Class For Testing                               #
-###############################################################################
-
-
-class TestMe(object):
-
-    """This class is just used to test method submission."""
-
-    def __init__(self):
-        
"""Initialize self.""" - self.me = 24 - self.out = None - - def do_math(self, number): - """Multiply self.me by number.""" - self.out = self.me*number - - def get_out(self): - """Return out.""" - return self.out - - -############################################################################### -# Test Functions # -############################################################################### - - -def test_job_creation(): - """Make a job and print it.""" - fyrd.queue.MODE = 'local' - job = fyrd.Job('echo hi', cores=2, time='00:02:00', mem='2000', - threads=4, clean_files=False, clean_outputs=False) - assert job.qtype == 'local' - return 0 - - -def test_job_execution(): - """Run a job and autoclean.""" - fyrd.queue.MODE = 'local' - job = fyrd.Job('echo hi', profile='default', clean_files=True, - clean_outputs=True).submit() - job.wait() - assert os.path.isfile(job.outfile) - assert os.path.isfile(job.errfile) - assert os.path.isfile(job.submission.file_name) - out = job.get() - assert not os.path.isfile(job.outfile) - assert not os.path.isfile(job.errfile) - assert not os.path.isfile(job.submission.file_name) - assert job.exitcode == 0 - assert out == 'hi\n' - assert job.stdout == 'hi\n' - assert job.stderr == '' - assert isinstance(job.start, dt) - assert isinstance(job.end, dt) - assert isinstance(job.runtime, td) - return 0 - - -def test_job_cleaning(): - """Delete intermediate files without autoclean.""" - fyrd.queue.MODE = 'local' - job = fyrd.Job('echo hi', profile='default', clean_files=False, - clean_outputs=False).submit() - job.wait() - assert os.path.isfile(job.outfile) - assert os.path.isfile(job.errfile) - assert os.path.isfile(job.submission.file_name) - job.submission.clean() - job.clean(delete_outputs=True) - assert not os.path.isfile(job.outfile) - assert not os.path.isfile(job.errfile) - assert not os.path.isfile(job.submission.file_name) - return 0 - - -def test_function_submission(): - """Submit a function.""" - failed = False - fyrd.queue.MODE = 'local' - job = fyrd.Job(write_to_file, ('42', 'bobfile')) - job.submit() - job.wait() - job.fetch_outputs() - out = job.get(delete_outfiles=False) - job.function.clean(delete_output=True) - job.clean() - assert job.exitcode == 0 - assert out == 0 - assert job.out == 0 - assert job.stdout == '\n' - if job.stderr != '': - sys.stderr.write('STDERR should be empty, but contains:\n') - sys.stderr.write(job.stderr) - failed = True - with open('bobfile') as fin: - assert fin.read().rstrip() == '42' - os.remove('bobfile') - job.clean(delete_outputs=True) - if failed: - return 1 - return 0 - - -def test_method_submission(): - """Submit a method.""" - t = TestMe() - job = fyrd.Job(t.do_math, (2,)) - t2 = job.get() - assert t2.get_out() == t.me*2 - - -def test_function_keywords(): - """Submit a simple function with keyword arguments.""" - job = fyrd.Job(raise_me, (10,), kwargs={'power': 10}).submit() - assert job.get() == 10**10 - job.clean(delete_outputs=True) - return 0 - - -def test_splitfile(): - """Use the splitfile helper function.""" - out = fyrd.helpers.splitrun(2, 'tests/test.txt.gz', - False, dosomething, ('{file}',)) - assert sorted(out) == sorted(dosomething('tests/test.txt.gz')) - return 0 - - -def test_splitfile_script(): - """Test splitfile() with a script and outfile.""" - out = fyrd.helpers.splitrun(2, 'tests/test.txt.gz', - False, dosomething, ('{file}',)) - assert out == dosomething('tests/test.txt.gz') - return 0 - - -def test_splitfile_indirect(): - """Use the splitfile helper function.""" - job = 
fyrd.helpers.splitrun(2, 'tests/test.txt.gz', - False, dosomething, ('{file}',), direct=False) - out = job.get() - assert sorted(out) == sorted(dosomething('tests/test.txt.gz')) - return 0 - - -def test_splitfile_bad(): - """Use the splitfile helper function and fail.""" - if not fyrd.logme.MIN_LEVEL == 'debug': - old_level = fyrd.logme.MIN_LEVEL - fyrd.logme.MIN_LEVEL = 'critical' - try: - fyrd.helpers.splitrun(2, 'tests/test.txt.gz', - False, dosomethingbad, ('{file}',)) - except AttributeError: - fyrd.basic.clean_dir('.', delete_outputs=True, confirm=False) - try: - os.remove('test.txt.gz.split_0001.gz') - except OSError: - pass - try: - os.remove('test.txt.gz.split_0002.gz') - except OSError: - pass - return 0 - finally: - if not fyrd.logme.MIN_LEVEL == 'debug': - fyrd.logme.MIN_LEVEL = old_level - - -def test_dir_clean(): - """Clean all job files in this dir.""" - fyrd.basic.clean_dir(delete_outputs=True) - return 0 - - -def main(argv=None): - """Get arguments and run tests.""" - if not argv: - argv = sys.argv[1:] - - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('-v', '--verbose', action="store_true", - help="Verbose") - - args = parser.parse_args(argv) - - if args.verbose: - fyrd.logme.MIN_LEVEL = 'debug' - - count = 0 - test_job_creation() - test_job_execution() - print('Cleaning') - count += test_job_cleaning() - print('Function submission') - count += test_function_submission() - print('Function keywords') - count += test_function_keywords() - print('Dir clean') - count += test_dir_clean() - # These tests frequently stall, I don't know why. - # print('Splitfile') - # count += test_splitfile() - # print('Splitfile Script') - # count += test_splitfile_script() - # print('Splitfile bad') - # count += test_splitfile_bad() - if count > 0: - sys.stderr.write('Some tests failed') - return count - sys.stdout.write('Tests complete\n') - -if __name__ == '__main__' and '__file__' in globals(): - sys.exit(main()) diff --git a/tests/options_help.txt b/tests/options_help.txt index d961785..c2fd454 100644 --- a/tests/options_help.txt +++ b/tests/options_help.txt @@ -30,10 +30,6 @@ Used for function calls:: imports: Imports to be used in function calls (e.g. sys, os) Type: list; Default: None -Used only in local mode:: -threads: Number of threads to use on the local machine - Type: int; Default: 4 - Options that work in both slurm and torque:: nodes: Number of nodes to request Type: int; Default: 1 diff --git a/tests/run_tests.py b/tests/run_tests.py index ec6678a..1f6dffc 100755 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -7,7 +7,7 @@ AUTHOR: Michael D Dacre, mike.dacre@gmail.com CREATED: 2016-54-22 15:06 - Last modified: 2016-12-01 18:03 + Last modified: 2017-08-05 23:54 DESCRIPTION: Run multiple kinds of tests, provide options to skip some. @@ -46,7 +46,7 @@ def main(argv=None): if os.path.exists('.coverage'): os.remove('.coverage') cmnd = ['coverage', 'run', '-a', '--source', 'fyrd'] - pytt = ['py.test', '--cov=fyrd'] + pytt = cmnd + ['-m', 'pytest', '--cov=fyrd'] else: cmnd = [sys.executable] pytt = ['py.test'] @@ -60,15 +60,6 @@ def main(argv=None): print('py.test tests complete with code {}, running local queue test.' .format(outcode)) - local_args = cmnd + ['tests/local_queue.py'] - if args.verbose: - local_args.append('-v') - - outcode += call(local_args) - - print('local test complete with outcode {}.' 
- .format(outcode)) - return outcode if __name__ == '__main__' and '__file__' in globals(): diff --git a/tests/test_config.py b/tests/test_config.py index 3077c81..f3574d7 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -5,6 +5,10 @@ import fyrd +def test_get_config(): + """Simply get the config.""" + fyrd.conf.get_config() + def test_change_file(): """Change the default config file to here.""" fyrd.conf.CONFIG_FILE = os.path.abspath('conftest') @@ -17,6 +21,7 @@ def test_create(): {'jobs':{'profile_file': os.path.abspath('proftest')}} ) assert os.path.isfile(os.path.abspath('conftest')) + fyrd.conf.load_config() def test_get(): diff --git a/tests/test_local.py b/tests/test_local.py deleted file mode 100644 index 090be38..0000000 --- a/tests/test_local.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Test misc functions.""" -import os -import sys -import pytest -sys.path.append(os.path.abspath('.')) -import fyrd -env = fyrd.get_cluster_environment() - - -@pytest.mark.skipif(env == 'local', - reason="Implemented elsewhere") -def test_job_creation(): - """Make a job and print it.""" - env = 'local' - fyrd.queue.MODE = 'local' - job = fyrd.Job('echo hi', cores=2, time='00:02:00', mem='2000', - threads=4, qtype='local') - assert job.qtype == 'local' - env = fyrd.get_cluster_environment() - fyrd.queue.MODE = env - - -def test_raiser(): - """Test the raiser function.""" - with pytest.raises(Exception): - try: - raise(Exception('bob')) - except Exception: - i = sys.exc_info() - assert fyrd.run.is_exc(i) - fyrd.run.raiser(i) - - -def simple_iterator(): - """Use with test_listify.""" - for i in [1,2,3,4]: - yield i - - -def test_listify(): - """Test the listify function.""" - assert fyrd.run.listify('hi') == ['hi'] - assert fyrd.run.listify(1) == [1] - assert fyrd.run.listify(('hi',)) == ['hi'] - assert fyrd.run.listify(('hi',)) == ['hi'] - assert fyrd.run.listify(simple_iterator()) == [1,2,3,4] - assert fyrd.run.listify(simple_iterator) == [1,2,3,4] - - -def test_count_lines(): - """Test the line counter.""" - assert fyrd.run.count_lines('tests/test.txt.gz') == 2200 - assert fyrd.run.count_lines('tests/test.txt.gz', True) == 2200 - - -def test_file_type(): - """Test fyrd.run.file_type().""" - assert fyrd.run.file_type('my_file.txt') == 'txt' - assert fyrd.run.file_type('my_file.txt.gz') == 'txt' - assert fyrd.run.file_type('my_file.txt.bz2') == 'txt' - assert fyrd.run.file_type('/a/dir/my_file.txt') == 'txt' - assert fyrd.run.file_type('/a/dir/my_file.txt.gz') == 'txt' - assert fyrd.run.file_type('/a/dir/my_file.txt.bz2') == 'txt' - assert fyrd.run.file_type('my_file.bob.txt') == 'txt' - assert fyrd.run.file_type('my_file.bob.txt.gz') == 'txt' - assert fyrd.run.file_type('my_file.bob.txt.bz2') == 'txt' - - -def test_is_file_type(): - """Test fyrd.run.is_file_type().""" - assert fyrd.run.is_file_type('bob.txt', 'txt') - assert fyrd.run.is_file_type('bob.txt', ['txt', 'john']) - assert fyrd.run.is_file_type('bob.john', ['txt', 'john']) - assert fyrd.run.is_file_type('bob.fred', ['txt', 'john']) is False - with open('./tests/test.txt.gz') as fin: - assert fyrd.run.is_file_type(fin, 'txt') - - -def test_replace_args(): - """Test fyrd.run.replace_argument().""" - arg1 = ('{file}',) - arg2 = dict(file='{file}') - arg3 = [arg1, arg2] - out1 = ('spam',) - out2 = dict(file='spam') - out3 = [out1, out2] - assert fyrd.run.replace_argument(arg1, '{file}', 'spam') == out1 - assert fyrd.run.replace_argument(arg2, '{file}', 'spam') == out2 - assert fyrd.run.replace_argument(arg3, '{file}', 'spam') == out3 
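As an aside before the remaining test changes: the get_pbar helper added to
fyrd/run.py above is the single entry point for the new progress bars. A
minimal usage sketch, assuming this changeset is installed; the range(100)
input and the 'demo'/'files' labels are arbitrary illustrations:

    from fyrd import run

    # Iterates exactly like the wrapped iterable; the bar is suppressed when
    # the 'progressbar' config option is False or disable=True is passed.
    for _ in run.get_pbar(range(100), name='demo', unit='files'):
        pass

Any other tqdm keyword argument passes straight through via **kwargs.
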
diff --git a/tests/test_options.py b/tests/test_options.py index 2b8e545..53e8bba 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -5,7 +5,6 @@ import pytest sys.path.append(os.path.abspath('.')) import fyrd -fyrd.local.THREADS = 5 def test_help(): @@ -24,24 +23,20 @@ def test_help(): def test_dict_types(): """Make sure all expected dictionaries exist and have the right type.""" assert hasattr(fyrd.options, 'COMMON') - assert hasattr(fyrd.options, 'NORMAL') assert hasattr(fyrd.options, 'CLUSTER_OPTS') assert hasattr(fyrd.options, 'TORQUE') assert hasattr(fyrd.options, 'SLURM') assert hasattr(fyrd.options, 'SLURM_KWDS') assert hasattr(fyrd.options, 'TORQUE_KWDS') assert hasattr(fyrd.options, 'CLUSTER_KWDS') - assert hasattr(fyrd.options, 'NORMAL_KWDS') assert hasattr(fyrd.options, 'ALLOWED_KWDS') assert isinstance(fyrd.options.COMMON, OrderedDict) - assert isinstance(fyrd.options.NORMAL, OrderedDict) assert isinstance(fyrd.options.CLUSTER_OPTS, OrderedDict) assert isinstance(fyrd.options.TORQUE, OrderedDict) assert isinstance(fyrd.options.SLURM, OrderedDict) assert isinstance(fyrd.options.SLURM_KWDS, OrderedDict) assert isinstance(fyrd.options.TORQUE_KWDS, OrderedDict) assert isinstance(fyrd.options.CLUSTER_KWDS, OrderedDict) - assert isinstance(fyrd.options.NORMAL_KWDS, OrderedDict) assert isinstance(fyrd.options.ALLOWED_KWDS, OrderedDict) @@ -159,10 +154,6 @@ def test_string_formatting(): '#SBATCH -o joe', '#SBATCH -p large' ] - assert fyrd.options.options_to_string( - test_options, - qtype='local' - ) == '\n' with pytest.raises(fyrd.options.OptionsError): fyrd.options.option_to_string('nodes', 2) with pytest.raises(ValueError): @@ -171,4 +162,4 @@ def test_string_formatting(): def test_back_to_normal(): """Return the queue to the normal setting.""" - fyrd.queue.get_cluster_environment() + fyrd.batch_systems.get_cluster_environment() diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 74decad..7f3a166 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -3,9 +3,11 @@ Pandas is hard to install, so this isn't part of the travis py.test. 
""" +import os import sys import argparse from uuid import uuid4 +sys.path.append(os.path.abspath('.')) import fyrd import pytest try: @@ -14,7 +16,7 @@ canrun = True except ImportError: canrun = False -env = fyrd.get_cluster_environment() +env = fyrd.batch_systems.get_cluster_environment() ############################################################################### @@ -203,7 +205,7 @@ def main(argv=None): args = parser.parse_args(argv) if args.local: - fyrd.queue.MODE = 'local' + fyrd.batch_systems.MODE = 'local' if args.verbose: fyrd.logme.MIN_LEVEL = 'debug' diff --git a/tests/test_queue.py b/tests/test_queue.py index 0f5ccc8..2d1abec 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -1,32 +1,31 @@ """Test remote queues, we can't test local queues in py.test.""" import os import sys -import pytest sys.path.append(os.path.abspath('.')) import fyrd -env = fyrd.queue.get_cluster_environment() +env = fyrd.batch_systems.get_cluster_environment() def test_queue_inspection(): """Make sure that if qsub or sbatch are available, the queue is right.""" queue_type = fyrd.conf.get_option('queue', 'queue_type') if queue_type != 'auto': - assert fyrd.queue.MODE == queue_type + assert fyrd.batch_systems.MODE == queue_type cfile = fyrd.conf.CONFIG_FILE fyrd.conf.CONFIG_FILE = 'conftest' fyrd.conf.create_config() - fyrd.queue.get_cluster_environment() + fyrd.batch_systems.get_cluster_environment() if fyrd.run.which('sbatch'): - assert fyrd.queue.MODE == 'slurm' + assert fyrd.batch_systems.MODE == 'slurm' elif fyrd.run.which('qsub'): - assert fyrd.queue.MODE == 'torque' + assert fyrd.batch_systems.MODE == 'torque' else: - assert fyrd.queue.MODE == 'local' + assert fyrd.batch_systems.MODE == 'local' if queue_type != 'auto': fyrd.conf.CONFIG_FILE = cfile os.remove('conftest') - fyrd.queue.get_cluster_environment() - assert env == fyrd.queue.MODE + fyrd.batch_systems.get_cluster_environment() + assert env == fyrd.batch_systems.MODE def test_queue_creation(): @@ -36,9 +35,3 @@ def test_queue_creation(): queue = fyrd.Queue() assert queue.qtype == env len(queue) - - -def test_queue_parsers(): - """Test the queue parsers.""" - with pytest.raises(fyrd.ClusterError): - fyrd.queue.queue_parser('local') diff --git a/tests/test_remote.py b/tests/test_remote.py index 2e23d43..a2c461b 100644 --- a/tests/test_remote.py +++ b/tests/test_remote.py @@ -7,7 +7,7 @@ sys.path.append(os.path.abspath('.')) import fyrd -env = fyrd.get_cluster_environment() +env = fyrd.batch_systems.get_cluster_environment() fyrd.logme.MIN_LEVEL = 'debug' @@ -78,15 +78,32 @@ def do_math(self, number): ############################################################################### +def test_torque_array_normalization(): + """Torque specific job normalization.""" + batch = fyrd.batch_systems.get_batch_system('torque') + assert batch.normalize_job_id('12345[24]') == ('12345', '24') + assert batch.normalize_job_id('12345[]') == ('12345', None) + + +def test_torque_state_normalization(): + """Torque specific job normalization.""" + batch = fyrd.batch_systems.get_batch_system('torque') + assert batch.normalize_state('R') == 'running' + assert batch.normalize_state('c') == 'completed' + + +def test_slurm_array_normalization(): + """Torque specific job normalization.""" + batch = fyrd.batch_systems.get_batch_system('slurm') + assert batch.normalize_job_id('12345_24') == ('12345', '24') + + def test_job_creation(): """Make a job and print it.""" - job = fyrd.Job('echo hi', cores=2, time='00:02:00', mem='2000', - threads=4, 
+    job = fyrd.Job('echo hi', cores=2, time='00:02:00', mem='2000')
     assert job.qtype == env
 
 
-@pytest.mark.skipif(env == 'local',
-                    reason="Fails in local mode")
 def test_job_execution():
     """Run a job and autoclean."""
     job = fyrd.Job('echo hi', profile='default', clean_files=True,
                    clean_outputs=True).submit()
@@ -115,8 +132,56 @@
     assert isinstance(job.runtime, td)
 
 
-@pytest.mark.skipif(env == 'local',
-                    reason="Fails in local mode")
+def test_multi_job_get():
+    """Run two jobs and fetch both results with fyrd.basic.get."""
+    job = fyrd.Job('echo hi', profile='default', clean_files=True,
+                   clean_outputs=True).submit()
+    job2 = fyrd.Job('echo ho', profile='default', clean_files=True,
+                    clean_outputs=True).submit()
+    outs = fyrd.basic.get([job, job2])
+    assert outs[0] == 'hi\n'
+    assert outs[1] == 'ho\n'
+    assert job.stdout == 'hi\n'
+    assert job2.stdout == 'ho\n'
+    assert not os.path.isfile(job.outfile)
+    assert not os.path.isfile(job2.outfile)
+
+
+def test_basic_job():
+    """Run a job using the basic.py submit function."""
+    fyrd.basic.make_job('echo hi', profile='default')
+    job = fyrd.submit('echo hi', profile='default')
+    job.wait()
+    print(repr(job))
+    print(str(job))
+    print(repr(job.submission))
+    print(str(job.submission))
+    print(job.outfile)
+    assert os.path.isfile(job.outfile)
+    assert os.path.isfile(job.errfile)
+    assert os.path.isfile(job.submission.file_name)
+    out = job.get()
+    assert not os.path.isfile(job.outfile)
+    assert not os.path.isfile(job.errfile)
+    assert not os.path.isfile(job.submission.file_name)
+    sys.stdout.write('{};\nSTDOUT: {}\nSTDERR: {}\n'
+                     .format(job.exitcode, job.stdout, job.stderr))
+    assert job.exitcode == 0
+    assert out == 'hi\n'
+    assert job.stdout == 'hi\n'
+    assert job.stderr == ''
+    assert isinstance(job.start, dt)
+    assert isinstance(job.end, dt)
+    assert isinstance(job.runtime, td)
+
+
+def test_make_job_file():
+    """Use the basic function to make a job file."""
+    job_file = fyrd.basic.make_job_file('echo hi')
+    assert os.path.isfile(job_file)
+    os.remove(job_file)
+
+
 def test_job_execution_paths():
     """Run a job and autoclean with defined paths."""
     os.makedirs('out')
@@ -144,11 +209,10 @@
     assert isinstance(job.start, dt)
     assert isinstance(job.end, dt)
     assert isinstance(job.runtime, td)
+    os.removedirs('out')
     os.system('rm -rf {}'.format('out'))
 
 
-@pytest.mark.skipif(env == 'local',
-                    reason="Fails in local mode")
 def test_job_params():
     """Run a job with some explicit parameters set."""
     job = fyrd.Job('echo ho', profile='default', clean_files=True,
@@ -160,8 +224,6 @@
     assert job.stderr == ''
 
 
-@pytest.mark.skipif(env == 'local',
-                    reason="Fails in local mode")
 def test_outfiles():
     """Run a job with outfile and errfile overriden parameters set."""
     job = fyrd.Job('echo ho', profile='default', clean_files=True,
@@ -173,17 +235,12 @@
     assert job.stderr == ''
 
 
-@pytest.mark.skipif(env == 'local',
-                    reason="Fails in local mode")
 def test_depends():
     """Run some jobs with dependencies."""
     job = fyrd.Job('sleep 3', profile='default', clean_files=True,
                    clean_outputs=True)
     job.submit()
     job.submit()  # Test submission abort
-    with pytest.raises(fyrd.ClusterError):
-        job2 = fyrd.Job('echo eggs', profile='default', clean_files=True,
-                        clean_outputs=True, depends='job').submit()
     job2 = fyrd.Job('echo eggs', profile='default', clean_files=True,
                     clean_outputs=True, depends=job).submit()
     out = job2.get()
@@ -198,8 +255,6 @@
     assert job3.stderr == ''
-@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_resubmit(): """Alter a job and resubmit.""" job = fyrd.Job('echo ho', profile='default', clean_files=True, @@ -209,16 +264,14 @@ def test_resubmit(): assert out == 'ho\n' assert job.stdout == 'ho\n' assert job.stderr == '' - # job.command = 'echo hi' + job.command = 'echo hi' job.resubmit() out = job.get() - assert out == 'ho\n' - assert job.stdout == 'ho\n' + assert out == 'hi\n' + assert job.stdout == 'hi\n' assert job.err == '' -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_job_cleaning(): """Delete intermediate files without autoclean.""" job = fyrd.Job('echo hi', profile='default', clean_files=False, @@ -234,8 +287,28 @@ def test_job_cleaning(): assert not os.path.isfile(job.submission.file_name) -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") +def test_multi_job_cleaning(): + """Delete intermediate files for more than one job.""" + job = fyrd.Job('echo hi', profile='default', clean_files=False, + clean_outputs=False).submit() + job2 = fyrd.Job('echo ho', profile='default', clean_files=False, + clean_outputs=False).submit() + fyrd.basic.wait([job, job2]) + assert os.path.isfile(job.outfile) + assert os.path.isfile(job.errfile) + assert os.path.isfile(job.submission.file_name) + assert os.path.isfile(job2.outfile) + assert os.path.isfile(job2.errfile) + assert os.path.isfile(job2.submission.file_name) + fyrd.basic.clean([job, job2], clean_outputs=True) + assert not os.path.isfile(job.outfile) + assert not os.path.isfile(job.errfile) + assert not os.path.isfile(job.submission.file_name) + assert not os.path.isfile(job2.outfile) + assert not os.path.isfile(job2.errfile) + assert not os.path.isfile(job2.submission.file_name) + + def test_function_submission(): """Submit a function.""" job = fyrd.Job(write_to_file, ('42', 'bobfile'), clean_files=False) @@ -266,8 +339,6 @@ def test_function_submission(): job.clean(delete_outputs=True) -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_method_submission(): """Submit a method.""" t = MethodSubmission() @@ -276,8 +347,6 @@ def test_method_submission(): assert t2 == t.me*2 -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_function_keywords(): """Submit a simple function with keyword arguments.""" job = fyrd.Job(raise_me, (10,), kwargs={'power': 10}).submit() @@ -285,8 +354,6 @@ def test_function_keywords(): job.clean(delete_outputs=True) -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_splitfile(): """Use the splitfile helper function.""" out = fyrd.helpers.splitrun(2, 'tests/test.txt.gz', @@ -294,8 +361,6 @@ def test_splitfile(): assert out == dosomething('tests/test.txt.gz') -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_splitfile_script(): """Test splitfile() with a script and outfile.""" out = fyrd.helpers.splitrun(2, 'tests/test.txt.gz', @@ -303,32 +368,20 @@ def test_splitfile_script(): assert out == dosomething('tests/test.txt.gz') -@pytest.mark.skipif(env == 'local', - reason="Fails in local mode") def test_splitfile_indirect(): """Use the splitfile helper function.""" job = fyrd.helpers.splitrun( 2, 'tests/test.txt.gz', False, SCRIPT, name='test', - outfile='test.out.txt', direct=False) + outfile='test.out.txt', direct=False + ) job.wait() + os.remove('test.txt.gz.split_0001.gz.out') + os.remove('test.txt.gz.split_0002.gz.out') assert os.path.isfile('test.out.txt') os.remove('test.out.txt') 
return 0
 
 
-# This test does not currently work
-# @pytest.mark.skipif(env == 'local',
-                     # reason="Fails in local mode")
-# def test_splitfile_bad():
-    # """Use the splitfile helper function and fail."""
-    # with pytest.raises(AttributeError):
-        # fyrd.helpers.splitrun(2, 'tests/test.txt.gz',
-                              # False, dosomethingbad, ('{file}',))
-    # scriptpath = fyrd.conf.get_job_paths(dict())[3]
-    # for i in ['test.txt.gz.split_0001.gz', 'test.txt.gz.split_0002.gz']:
-        # os.remove(os.path.join(scriptpath, i))
-
-
 def test_dir_clean():
     """Clean all job files in this dir."""
     fyrd.basic.clean_dir(delete_outputs=True)
diff --git a/tests/test_run.py b/tests/test_run.py
new file mode 100644
index 0000000..98396ed
--- /dev/null
+++ b/tests/test_run.py
@@ -0,0 +1,28 @@
+"""Test things in run.py that are not used often."""
+import os
+import fyrd
+fyrd.logme.MIN_LEVEL = 'debug'
+
+def test_write_iterable():
+    """Run the write_iterable function."""
+    fyrd.run.write_iterable(['hi', 'there'], 'test.out')
+    with open('test.out') as infile:
+        contents = infile.read()
+    assert contents == 'hi\nthere'
+    os.remove('test.out')
+
+def test_file_type():
+    """Test file type parsing."""
+    assert fyrd.run.file_type('file.txt') == 'txt'
+    assert fyrd.run.file_type('file.txt.gz') == 'txt'
+    assert fyrd.run.file_type('file.txt.bz2') == 'txt'
+    with open('hi.txt', 'w') as fout:
+        assert fyrd.run.is_file_type(fout, 'txt')
+    os.remove('hi.txt')
+    assert not fyrd.run.is_file_type('file.txt', 'bob')
+
+def test_which():
+    """Test getting paths."""
+    ls = fyrd.run.which('ls')
+    assert ls
+    assert fyrd.run.which(ls) == ls
diff --git a/tests/write_options_to_file.py b/tests/write_options_to_file.py
index db226a3..37ca22e 100755
--- a/tests/write_options_to_file.py
+++ b/tests/write_options_to_file.py
@@ -9,7 +9,7 @@
        ORGANIZATION: Stanford University
             LICENSE: MIT License, property of Stanford, use as you wish
             CREATED: 2016-31-16 23:06
-       Last modified: 2016-11-04 17:35
+       Last modified: 2017-08-05 23:55
 
 ============================================================================
 """
@@ -16,8 +16,7 @@
 import os
 import sys
 sys.path.append(os.path.abspath('../'))
 import fyrd
-fyrd.local.THREADS = 5
 
 with open('options_help.txt', 'w') as fout:
     fout.write(fyrd.option_help(mode='string'))
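
Taken together, the new tests above (test_multi_job_get, test_multi_job_cleaning)
pin down the user-facing multi-job pattern. A minimal end-to-end sketch,
assuming a reachable torque or slurm queue and the built-in 'default' profile;
the echo commands are placeholders:

    import fyrd

    # submit() returns the Job object, so creation and queueing chain together
    job1 = fyrd.Job('echo hi', profile='default').submit()
    job2 = fyrd.Job('echo ho', profile='default').submit()

    # Block on the whole batch (a tqdm bar tracks completion) and collect each
    # job's STDOUT, exactly as test_multi_job_get does above.
    outs = fyrd.basic.get([job1, job2])
    assert outs == ['hi\n', 'ho\n']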