Use a file for ESPEI inputs.

gh#28 closes PhasesResearchLab#27

* ESPEI deprecates command line input arguments in favor of a YAML (or JSON; other formats are possible) input file.
* Files are validated using [cerberus](http://docs.python-cerberus.org/en/stable/index.html) with a schema.
* Support for setting the number of chains per parameter and the standard deviation of the chains.
* Include tests for several different possible runs.

Almost all of the constraints are handled by cerberus, including:

* checking for parameter conflicts
* handling enumeration options (e.g. choose either 'linear' or 'exponential' models, validated with regex)
* checking for filetype compatibility (again, regex)
RushiGong · Sep 18, 2017 · b1272af · b1272af
1 parent 78b07e6
commit b1272af
Show file tree

Hide file tree

Showing 7 changed files with 386 additions and 121 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,3 @@
 include versioneer.py
 include espei/_version.py
+include espei/input-schema.yaml
diff --git a/espei/__init__.py b/espei/__init__.py
@@ -2,4 +2,24 @@
 __version__ = get_versions()['version']
 del get_versions
 
+import os
+import yaml
+from cerberus import Validator
+
+MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Validator subclass that adds the custom `iseven` rule used by the input schema
+class ESPEIValidator(Validator):
+    def _validate_iseven(self, iseven, field, value):
+        """ Test the oddity of a value.
+
+        The rule's arguments are validated against this schema:
+        {'type': 'boolean'}
+        """
+        if iseven and bool(value & 1):
+            self._error(field, "Must be an even number")
+
+with open(os.path.join(MODULE_DIR, 'input-schema.yaml')) as f:
+    schema = ESPEIValidator(yaml.load(f))
+
 from espei.paramselect import fit
diff --git a/espei/input-schema.yaml b/espei/input-schema.yaml
@@ -0,0 +1,91 @@
+# core run settings
+system: # phase models and input data
+  type: dict
+  schema:
+    phase_models: # describes the CALPHAD models for the phases
+      type: string
+      required: True
+      regex: '.*\.json$'
+    datasets: # path to the datasets; required, since this schema supplies no default
+      type: string
+      required: True
+
+output:
+  type: dict
+  default: {}
+  schema:
+    verbosity: # integer verbosity level 0 | 1 | 2, where 2 is most verbose.
+      type: integer
+      min: 0
+      max: 2
+      default: 0
+      required: True
+    output_db:
+      type: string
+      default: out.tdb
+    tracefile: # name of the file containing the mcmc chain array
+      type: string
+      default: chain.npy
+      regex: '.*\.npy$'
+    probfile: # name of the file containing the mcmc ln probability array
+      type: string
+      default: lnprob.npy
+      regex: '.*\.npy$'
+
+## if present, will do a single phase fitting
+generate_parameters:
+  type: dict
+  schema:
+    excess_model:
+      type: string
+      required: True
+      regex: 'linear'
+    ref_state:
+      type: string
+      required: True
+      regex: 'SGTE91'
+
+## if present, will run mcmc fitting
+## you must specify some kind of input for the parameters.
+## Parameters can come from
+##   1. a preceding generate_parameters step
+##   2. by generating chains from a previous input_db
+##   3. by using chains from a restart_chain for phases in an input_db
+mcmc:
+  type: dict
+  oneof_dependencies:
+    - 'mcmc.input_db'
+    - 'generate_parameters'
+  schema:
+    mcmc_steps:
+      type: integer
+      min: 1
+      required: True
+    mcmc_save_interval:
+      type: integer
+      default: 20
+      min: 1
+      required: True
+    scheduler: # scheduler to use for parallelization
+      type: string
+      default: dask # dask | MPIPool
+      regex: 'dask|MPIPool'
+      required: True
+    input_db: # TDB file used to start the mcmc run
+      type: string
+    restart_chain: # restart the mcmc fitting from a previous calculation
+      type: string
+      dependencies: input_db
+      regex: '.*\.npy$'
+    chains_per_parameter: # number of chains to use for each parameter; must be an even integer
+      type: integer
+      iseven: True
+      min: 2
+      allof:
+        - required: True
+        - excludes: restart_chain
+    chain_std_deviation: # fraction of a parameter for the standard deviation in the walkers
+      min: 0
+      allof:
+        - required: True
+        - excludes: restart_chain
diff --git a/espei/paramselect.py b/espei/paramselect.py
@@ -622,7 +622,7 @@ def lnprob(params, data=None, comps=None, dbf=None, phases=None, datasets=None,
 
 def fit(input_fname, datasets, resume=None, scheduler=None, run_mcmc=True,
         tracefile=None, probfile=None, restart_chain=None, mcmc_steps=1000,
-        save_interval=100):
+        save_interval=100, chains_per_parameter=2, chain_std_deviation=0.1):
     """Fit thermodynamic and phase equilibria data to a model.
     
     Parameters
@@ -652,6 +652,12 @@ def fit(input_fname, datasets, resume=None, scheduler=None, run_mcmc=True,
         int (Default value = 1000)
     save_interval : int
         interval of steps to save the chain to the tracefile.
+    chains_per_parameter : int
+        number of chains for each parameter. Must be an even integer greater
+        than or equal to 2. Defaults to 2.
+    chain_std_deviation : float
+        standard deviation of the normal distribution used to initialize the
+        chains, expressed as a fraction of each parameter's value. Must be
+        greater than 0. Default is 0.1, which is 10%.
 
     Returns
     -------
@@ -759,9 +765,9 @@ def save_sampler_state(sampler):
             initial_parameters = np.array(initial_parameters)
             logging.debug('Initial parameters: {}'.format(initial_parameters))
             ndim = len(initial_parameters)
-            nwalkers = 2*ndim # walkers must be of size (2n*ndim)
+            nwalkers = chains_per_parameter*ndim # walkers must be of size (2n*ndim)
             initial_walkers = np.tile(initial_parameters, (nwalkers, 1))
-            walkers = rng.normal(initial_walkers, np.abs(initial_walkers*0.10))
+            walkers = rng.normal(initial_walkers, np.abs(initial_walkers*chain_std_deviation))
 
         # set up with emcee
         import emcee