From 092771e542b9c2cfef466fc02e1fdb771e051e30 Mon Sep 17 00:00:00 2001 From: Weixuan Date: Fri, 12 Jun 2020 10:28:42 -0400 Subject: [PATCH 01/39] fix issue #1082 --- progress.txt | 0 tpot/base.py | 3 ++- tpot/gp_deap.py | 22 ++++++++++++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) delete mode 100644 progress.txt diff --git a/progress.txt b/progress.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tpot/base.py b/tpot/base.py index 69361799..427a2ced 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -722,7 +722,8 @@ def pareto_eq(ind1, ind2): pbar=self._pbar, halloffame=self._pareto_front, verbose=self.verbosity, - per_generation_function=self._check_periodic_pipeline + per_generation_function=self._check_periodic_pipeline, + log_file=self.log_file ) # Allow for certain exceptions to signal a premature fit() cancellation diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 1dc74067..d364b2b7 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -171,7 +171,8 @@ def initialize_stats_dict(individual): def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, - stats=None, halloffame=None, verbose=0, per_generation_function=None): + stats=None, halloffame=None, verbose=0, + per_generation_function=None, log_file=None): """This is the :math:`(\mu + \lambda)` evolutionary algorithm. :param population: A list of individuals. :param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution @@ -189,6 +190,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, :param verbose: Whether or not to log the statistics. :param per_generation_function: if supplied, call this function before each generation used by tpot to save best pipeline before each new generation + :param log_file: io.TextIOWrapper or io.StringIO, optional (default: sys.stdout) :returns: The final population :returns: A class:`~deap.tools.Logbook` with the statistics of the evolution.
@@ -252,18 +254,26 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, if not pbar.disable: # Print only the best individual fitness if verbose == 2: - high_score = max(halloffame.keys[x].wvalues[1] for x in range(len(halloffame.keys))) - pbar.fp.write('\nGeneration {0} - Current best internal CV score: {1}'.format(gen, high_score)) + high_score = max(halloffame.keys[x].wvalues[1] \ + for x in range(len(halloffame.keys))) + pbar.write('Generation {0} - Current ' + 'best internal CV score: {1}'.format(gen, + high_score), + + file=log_file) # Print the entire Pareto front elif verbose == 3: - pbar.fp.write('\nGeneration {} - Current Pareto front scores:'.format(gen)) + pbar.write('\nGeneration {} - ' + 'Current Pareto front scores:'.format(gen), + file=log_file) for pipeline, pipeline_scores in zip(halloffame.items, reversed(halloffame.keys)): - pbar.fp.write('\n{}\t{}\t{}'.format( + pbar.write('\n{}\t{}\t{}'.format( int(pipeline_scores.wvalues[0]), pipeline_scores.wvalues[1], pipeline - ) + ), + file=log_file ) # after each population save a periodic pipeline From a508e0cd2a5730f998b60995e93c981f9334c48b Mon Sep 17 00:00:00 2001 From: Weixuan Date: Fri, 12 Jun 2020 10:31:18 -0400 Subject: [PATCH 02/39] refine log file --- progress.txt | 0 tpot/gp_deap.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 progress.txt diff --git a/progress.txt b/progress.txt new file mode 100644 index 00000000..e69de29b diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index d364b2b7..53a25b26 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -256,7 +256,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, if verbose == 2: high_score = max(halloffame.keys[x].wvalues[1] \ for x in range(len(halloffame.keys))) - pbar.write('Generation {0} - Current ' + pbar.write('\nGeneration {0} - Current ' 'best internal CV score: {1}'.format(gen, high_score), From ae7fa5cbadcdbe17af383befe903ec8d2d42c94f Mon Sep 17 00:00:00 2001 From: Weixuan Date: Fri, 12 Jun 2020 11:02:58 -0400 Subject: [PATCH 03/39] remove log in unit test --- progress.txt | 0 tests/test_log_file.py | 14 +++++++++++--- tests/tpot_tests.py | 6 +++++- 3 files changed, 16 insertions(+), 4 deletions(-) delete mode 100644 progress.txt diff --git a/progress.txt b/progress.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_log_file.py b/tests/test_log_file.py index f42c2445..b9442eb8 100644 --- a/tests/test_log_file.py +++ b/tests/test_log_file.py @@ -28,6 +28,8 @@ from nose.tools import assert_equal, assert_true import os import re +from tempfile import mkdtemp +from shutil import rmtree data = load_iris() X = data['data'] @@ -38,7 +40,8 @@ def test_log_file_verbosity_1(): """ Set verbosity as 1. Assert log_file parameter to generate log file. """ - file_name = "progress_verbose_1.log" + cachedir = mkdtemp() + file_name = cachedir + "progress_verbose_1.log" tracking_progress_file = open(file_name, "w") tpot_obj = TPOTClassifier( population_size=POP_SIZE, @@ -48,10 +51,12 @@ def test_log_file_verbosity_1(): ) tpot_obj.fit(X, y) assert_equal(os.path.getsize(file_name), 0) + rmtree(cachedir) def test_log_file_verbosity_2(): """ Set verbosity as 2. Assert log_file parameter to generate log file. 
""" - file_name = "progress_verbose_2.log" + cachedir = mkdtemp() + file_name = cachedir + "progress_verbose_2.log" tracking_progress_file = open(file_name, "w") tpot_obj = TPOTClassifier( population_size=POP_SIZE, @@ -62,10 +67,12 @@ def test_log_file_verbosity_2(): tpot_obj.fit(X, y) assert_equal(os.path.getsize(file_name) > 0, True) check_generations(file_name, GEN_SIZE) + rmtree(cachedir) def test_log_file_verbose_3(): """ Set verbosity as 3. Assert log_file parameter to generate log file. """ - file_name = "progress_verbosity_3.log" + cachedir = mkdtemp() + file_name = cachedir + "progress_verbosity_3.log" tracking_progress_file = open(file_name, "w") tpot_obj = TPOTClassifier( population_size=POP_SIZE, @@ -76,6 +83,7 @@ def test_log_file_verbose_3(): tpot_obj.fit(X, y) assert_equal(os.path.getsize(file_name) > 0, True) check_generations(file_name, GEN_SIZE) + rmtree(cachedir) def check_generations(file_name, generations): """ Assert generation log message is present in log_file. """ diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index eedaff14..a50014fb 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -197,10 +197,14 @@ def test_init_custom_parameters(): def test_init_custom_progress_file(): """ Assert that TPOT has right file handler to save progress. """ - file_name = "progress.txt" + cachedir = mkdtemp() + file_name = cachedir + "/progress.log" file_handle = open(file_name, "w") tpot_obj = TPOTClassifier(log_file=file_handle) assert tpot_obj.log_file == file_handle + # clean up + rmtree(cachedir) + def test_init_default_scoring(): """Assert that TPOT intitializes with the correct default scoring function.""" From c2d41dbf0f1cf67704535861c64dfb502f11f740 Mon Sep 17 00:00:00 2001 From: Weixuan Date: Fri, 12 Jun 2020 11:11:11 -0400 Subject: [PATCH 04/39] remove progress.log --- tests/tpot_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index a50014fb..0143a025 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -202,6 +202,7 @@ def test_init_custom_progress_file(): file_handle = open(file_name, "w") tpot_obj = TPOTClassifier(log_file=file_handle) assert tpot_obj.log_file == file_handle + file_handle.close() # clean up rmtree(cachedir) From 2a90723e75904932b82446705958678e23a1c92d Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 29 Jul 2020 09:54:06 -0700 Subject: [PATCH 05/39] use duck typing to check whether an operator is a classifier, regeressor, trasnformer, or selector --- tpot/operator_utils.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index e66290c6..c0493ac0 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -25,6 +25,7 @@ import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.base import is_classifier, is_regressor from sklearn.gaussian_process.kernels import Kernel try: from sklearn.feature_selection._base import SelectorMixin @@ -118,6 +119,20 @@ def set_sample_weight(pipeline_steps, sample_weight=None): return None +def _is_selector(estimator): + selector_attributes = [ + "get_support", + "transform", + "inverse_transform", + "fit_transform" + ] + return all(hasattr(estimator, attr) for attr in selector_attributes) + + +def _is_transformer(estimator): + return hasattr(estimator, "fit_transform") + + def ARGTypeClassFactory(classname, prange, BaseClass=ARGType): """Dynamically create parameter type 
class. @@ -178,15 +193,15 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= return None, None else: # define if the operator can be the root of a pipeline - if issubclass(op_obj, ClassifierMixin): + if is_classifier(op_obj): class_profile['root'] = True optype = "Classifier" - elif issubclass(op_obj, RegressorMixin): + elif is_regressor(op_obj): class_profile['root'] = True optype = "Regressor" - if issubclass(op_obj, TransformerMixin): + if _is_transformer(op_obj): optype = "Transformer" - if issubclass(op_obj, SelectorMixin): + if _is_selector(op_obj): optype = "Selector" @classmethod @@ -291,9 +306,9 @@ def export(cls, *args): doptype = dep_op_type[dep_op_pname] if inspect.isclass(doptype): # a estimator if issubclass(doptype, BaseEstimator) or \ - issubclass(doptype, ClassifierMixin) or \ - issubclass(doptype, RegressorMixin) or \ - issubclass(doptype, TransformerMixin) or \ + is_classifier(doptype) or \ + is_regressor(doptype) or \ + _is_transformer(doptype) or \ issubclass(doptype, Kernel): arg_value = "{}({})".format(dep_op_str, ", ".join(dep_op_arguments[dep_op_str])) tmp_op_args.append("{}={}".format(dep_op_pname, arg_value)) From a5a99ed598c322f5f38390fb796d23383385f2df Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 29 Jul 2020 10:20:42 -0700 Subject: [PATCH 06/39] cleanup now unused imports --- tpot/operator_utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index c0493ac0..889a835f 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -24,13 +24,9 @@ """ import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.base import is_classifier, is_regressor from sklearn.gaussian_process.kernels import Kernel -try: - from sklearn.feature_selection._base import SelectorMixin -except ImportError: - from sklearn.feature_selection.base import SelectorMixin import inspect From ab47c7a97aad4a0f89f1438a20313fccdfdd738e Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 29 Jul 2020 11:24:07 -0700 Subject: [PATCH 07/39] combine sklearn.base imports into a single line --- tpot/operator_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index 889a835f..64b8ecf9 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -24,8 +24,7 @@ """ import numpy as np -from sklearn.base import BaseEstimator -from sklearn.base import is_classifier, is_regressor +from sklearn.base import BaseEstimator, is_classifier, is_regressor from sklearn.gaussian_process.kernels import Kernel import inspect From 8d4affc59e26252a0b3a645946a4100ce273f748 Mon Sep 17 00:00:00 2001 From: Weixuan Date: Mon, 3 Aug 2020 09:30:26 -0400 Subject: [PATCH 08/39] increase MAX_EVAL_SECS in pretest for dataset with large numbers of features #1088 --- tpot/decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/decorators.py b/tpot/decorators.py index aa949fee..3dac4b3c 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -33,7 +33,7 @@ NUM_TESTS = 10 -MAX_EVAL_SECS = 2 +MAX_EVAL_SECS = 10 def _pre_test(func): From f08acbfc44ae4a7757622aace82ec617974c7a82 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Tue, 18 Aug 2020 09:39:12 -0700 Subject: [PATCH 09/39] duck typing in stacking estimator for classifier checking --- tpot/builtins/stacking_estimator.py | 5 ++--- 1 file changed, 2
insertions(+), 3 deletions(-) diff --git a/tpot/builtins/stacking_estimator.py b/tpot/builtins/stacking_estimator.py index 4da1883a..7fcfcf07 100644 --- a/tpot/builtins/stacking_estimator.py +++ b/tpot/builtins/stacking_estimator.py @@ -24,7 +24,7 @@ """ import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin +from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, is_classifier from sklearn.utils import check_array @@ -83,7 +83,7 @@ def transform(self, X): X = check_array(X) X_transformed = np.copy(X) # add class probabilities as a synthetic feature - if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'): + if is_classifier(self.estimator) and hasattr(self.estimator, 'predict_proba'): y_pred_proba = self.estimator.predict_proba(X) # check all values that should be not infinity or not NAN if np.all(np.isfinite(y_pred_proba)): @@ -91,5 +91,4 @@ def transform(self, X): # add class prediction as a synthetic feature X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed)) - return X_transformed From 931bff7a9bd702c497651b56802bef2967870824 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 19 Aug 2020 15:25:26 +0000 Subject: [PATCH 10/39] remove unused classifiermixin import --- tpot/builtins/stacking_estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/builtins/stacking_estimator.py b/tpot/builtins/stacking_estimator.py index 7fcfcf07..9e513c3c 100644 --- a/tpot/builtins/stacking_estimator.py +++ b/tpot/builtins/stacking_estimator.py @@ -24,7 +24,7 @@ """ import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, is_classifier +from sklearn.base import BaseEstimator, TransformerMixin, is_classifier from sklearn.utils import check_array From d441de59f46439e5ae8ddaeff89c211863de9b96 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 19 Aug 2020 20:04:28 +0000 Subject: [PATCH 11/39] add cuml classifier config to configuration options and config setup process --- tpot/base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tpot/base.py b/tpot/base.py index 69361799..6ab941b1 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -71,6 +71,7 @@ from .config.regressor_sparse import regressor_config_sparse from .config.classifier_sparse import classifier_config_sparse from .config.classifier_nn import classifier_config_nn +from .config.classifier_cuml import classifier_config_cuml from .metrics import SCORERS from .gp_types import Output_Array @@ -345,6 +346,16 @@ def _setup_config(self, config_dict): self._config_dict = regressor_config_sparse elif config_dict == 'TPOT NN': self._config_dict = classifier_config_nn + elif config_dict == 'TPOT cuML': + if not _has_cuml(): + raise ValueError( + 'The GPU machine library cuML is not available. ' + 'To use cuML, please install cuML via conda.' 
+ ) + if self.classification: + self._config_dict = classifier_config_cuml + else: + pass else: config = self._read_config_file(config_dict) if hasattr(config, 'tpot_config'): @@ -1721,3 +1732,11 @@ def _generate(self, pset, min_, max_, condition, type_=None): for arg in reversed(prim.args): stack.append((depth + 1, arg)) return expr + + +def _has_cuml(): + try: + import cuml + return True + except ImportError: + return False From 899fb01c6b70cc500a10f0552a28e95289aae5e9 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 19 Aug 2020 22:43:49 +0000 Subject: [PATCH 12/39] cuml classifier and regressor configs --- tpot/base.py | 3 +- tpot/config/classifier_cuml.py | 116 ++++++++++++++++++++++++++++++ tpot/config/regressor_cuml.py | 124 +++++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 tpot/config/classifier_cuml.py create mode 100644 tpot/config/regressor_cuml.py diff --git a/tpot/base.py b/tpot/base.py index 6ab941b1..398a9c4d 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -72,6 +72,7 @@ from .config.classifier_sparse import classifier_config_sparse from .config.classifier_nn import classifier_config_nn from .config.classifier_cuml import classifier_config_cuml +from .config.regressor_cuml import regressor_config_cuml from .metrics import SCORERS from .gp_types import Output_Array @@ -355,7 +356,7 @@ def _setup_config(self, config_dict): if self.classification: self._config_dict = classifier_config_cuml else: - pass + self._config_dict = regressor_config_cuml else: config = self._read_config_file(config_dict) if hasattr(config, 'tpot_config'): diff --git a/tpot/config/classifier_cuml.py b/tpot/config/classifier_cuml.py new file mode 100644 index 00000000..3ab0f6e5 --- /dev/null +++ b/tpot/config/classifier_cuml.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np + +# This configuration provides users with access to a GPU the ability to +# use cuML classifiers as estimators alongside the scikit-learn +# preprocessors in the TPOT default configuration. 
+ +classifier_config_cuml = { + # cuML Classifiers + + 'cuml.neighbors.KNeighborsClassifier': { + 'n_neighbors': range(1, 101), + 'weights': ["uniform",], + }, + 'cuml.svm.SVC': { + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1,], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,] + }, + "cuml.ensemble.RandomForestClassifier": { + 'n_estimators': [100, 300, 500,], + 'split_algo': [0, 1,], + 'max_depth': range(8, 20), + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_rows_per_node': range(2, 21), + 'n_bins': [8, 64,] + }, + 'cuml.linear_model.LogisticRegression': { + 'penalty': ["l1", "l2", "elasticnet"], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,], + }, + + # Sklearn Preprocesssors + + 'sklearn.preprocessing.Binarizer': { + 'threshold': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.decomposition.FastICA': { + 'tol': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.cluster.FeatureAgglomeration': { + 'linkage': ['ward', 'complete', 'average'], + 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'] + }, + + 'sklearn.preprocessing.MaxAbsScaler': { + }, + + 'sklearn.preprocessing.MinMaxScaler': { + }, + + 'sklearn.preprocessing.Normalizer': { + 'norm': ['l1', 'l2', 'max'] + }, + + 'sklearn.kernel_approximation.Nystroem': { + 'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'], + 'gamma': np.arange(0.0, 1.01, 0.05), + 'n_components': range(1, 11) + }, + + 'sklearn.decomposition.PCA': { + 'svd_solver': ['randomized'], + 'iterated_power': range(1, 11) + }, + + 'sklearn.preprocessing.PolynomialFeatures': { + 'degree': [2], + 'include_bias': [False], + 'interaction_only': [False] + }, + + 'sklearn.kernel_approximation.RBFSampler': { + 'gamma': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.preprocessing.RobustScaler': { + }, + + 'sklearn.preprocessing.StandardScaler': { + }, + + 'tpot.builtins.ZeroCount': { + }, + + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], + 'sparse': [False], + 'threshold': [10] + }, +} diff --git a/tpot/config/regressor_cuml.py b/tpot/config/regressor_cuml.py new file mode 100644 index 00000000..cd9d4872 --- /dev/null +++ b/tpot/config/regressor_cuml.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np + +# This configuration provides users with access to a GPU the ability to +# use cuML regressors as estimators alongside the scikit-learn +# preprocessors in the TPOT default configuration. 
+ +regressor_config_cuml = { + # cuML Regressors + + 'cuml.linear_model.ElasticNet': { + 'l1_ratio': np.arange(0.0, 1.01, 0.05), + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + }, + + 'cuml.neighbors.KNeighborsRegressor': { + 'n_neighbors': range(1, 101), + 'weights': ["uniform"], + }, + + 'cuml.linear_model.Lasso': { + 'normalize': [True, False] + }, + + 'cuml.svm.SVR': { + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1,], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,] + }, + + 'cuml.ensemble.RandomForestRegressor': { + 'n_estimators': [100], + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'bootstrap': [True, False] + }, + + 'cuml.linear_model.Ridge': { + }, + + # Sklearn Preprocesssors + 'sklearn.preprocessing.Binarizer': { + 'threshold': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.decomposition.FastICA': { + 'tol': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.cluster.FeatureAgglomeration': { + 'linkage': ['ward', 'complete', 'average'], + 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'] + }, + + 'sklearn.preprocessing.MaxAbsScaler': { + }, + + 'sklearn.preprocessing.MinMaxScaler': { + }, + + 'sklearn.preprocessing.Normalizer': { + 'norm': ['l1', 'l2', 'max'] + }, + + 'sklearn.kernel_approximation.Nystroem': { + 'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'], + 'gamma': np.arange(0.0, 1.01, 0.05), + 'n_components': range(1, 11) + }, + + 'sklearn.decomposition.PCA': { + 'svd_solver': ['randomized'], + 'iterated_power': range(1, 11) + }, + + 'sklearn.preprocessing.PolynomialFeatures': { + 'degree': [2], + 'include_bias': [False], + 'interaction_only': [False] + }, + + 'sklearn.kernel_approximation.RBFSampler': { + 'gamma': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.preprocessing.RobustScaler': { + }, + + 'sklearn.preprocessing.StandardScaler': { + }, + + 'tpot.builtins.ZeroCount': { + }, + + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], + 'sparse': [False], + 'threshold': [10] + }, +} From dbb94a6bff632f59f0c979a38c876cf1f15bdc67 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 03:17:54 +0000 Subject: [PATCH 13/39] clean up cuML check and valueerror --- tpot/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 398a9c4d..cf3bcdbf 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -350,10 +350,10 @@ def _setup_config(self, config_dict): elif config_dict == 'TPOT cuML': if not _has_cuml(): raise ValueError( - 'The GPU machine library cuML is not available. ' - 'To use cuML, please install cuML via conda.' + 'The GPU machine learning library cuML is not ' + 'available. To use cuML, please install cuML via conda.' 
) - if self.classification: + elif self.classification: self._config_dict = classifier_config_cuml else: self._config_dict = regressor_config_cuml @@ -1736,8 +1736,8 @@ def _generate(self, pset, min_, max_, condition, type_=None): def _has_cuml(): - try: - import cuml - return True - except ImportError: - return False + try: + import cuml + return True + except ImportError: + return False From 078e7757233e4b20b6e3fbba573ea437bb99508b Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 03:48:29 +0000 Subject: [PATCH 14/39] cuML fit test for classifier --- tests/tpot_tests.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index aef16a2b..8eb31653 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -24,7 +24,7 @@ """ from tpot import TPOTClassifier, TPOTRegressor -from tpot.base import TPOTBase +from tpot.base import TPOTBase, _has_cuml from tpot.driver import float_range from tpot.gp_types import Output_Array from tpot.gp_deap import mutNodeReplacement, _wrapped_cross_val_score, pick_two_individuals_eligible_for_crossover, cxOnePoint, varOr, initialize_stats_dict @@ -1104,6 +1104,26 @@ def test_fit_7(): assert not (tpot_obj._start_datetime is None) +def test_fit_cuml(): + """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT cuML' if cuML is available. If not available, assert _fit_init raises a ValueError.""" + + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT cuML' + ) + + if _has_cuml(): + tpot_obj.fit(training_features, training_target) + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + else: + assert_raises(ValueError, tpot_obj._fit_init) + + def test_memory(): """Assert that the TPOT fit function runs normally with memory=\'auto\'.""" tpot_obj = TPOTClassifier( From 3c15656360ff2ac6bc99d11a67b742114427ce84 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 03:51:12 +0000 Subject: [PATCH 15/39] test cuml config dict correctly loads --- tests/tpot_tests.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 8eb31653..a6023367 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -39,6 +39,8 @@ from tpot.config.regressor_sparse import regressor_config_sparse from tpot.config.classifier_sparse import classifier_config_sparse from tpot.config.classifier_nn import classifier_config_nn +from tpot.config.classifier_cuml import classifier_config_cuml +from tpot.config.regressor_cuml import regressor_config_cuml import numpy as np import pandas as pd @@ -498,6 +500,10 @@ def test_conf_dict(): tpot_obj._fit_init() assert tpot_obj._config_dict == classifier_config_sparse + tpot_obj = TPOTClassifier(config_dict='TPOT cuML') + tpot_obj._fit_init() + assert tpot_obj._config_dict == classifier_config_cuml + tpot_obj = TPOTRegressor(config_dict='TPOT light') tpot_obj._fit_init() assert tpot_obj._config_dict == regressor_config_dict_light @@ -509,6 +515,10 @@ def test_conf_dict(): tpot_obj = TPOTRegressor(config_dict='TPOT sparse') tpot_obj._fit_init() assert tpot_obj._config_dict == regressor_config_sparse + + tpot_obj = TPOTRegressor(config_dict='TPOT cuML') + tpot_obj._fit_init() + assert tpot_obj._config_dict == regressor_config_cuml def test_conf_dict_2(): From cabdaf339b7b5854ffb1e9e02e21e3db576fae0a Mon Sep 17 
00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 14:18:45 +0000 Subject: [PATCH 16/39] wrap config test in cuml available block --- tests/tpot_tests.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index a6023367..e4a92bde 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -500,10 +500,6 @@ def test_conf_dict(): tpot_obj._fit_init() assert tpot_obj._config_dict == classifier_config_sparse - tpot_obj = TPOTClassifier(config_dict='TPOT cuML') - tpot_obj._fit_init() - assert tpot_obj._config_dict == classifier_config_cuml - tpot_obj = TPOTRegressor(config_dict='TPOT light') tpot_obj._fit_init() assert tpot_obj._config_dict == regressor_config_dict_light @@ -515,10 +511,15 @@ def test_conf_dict(): tpot_obj = TPOTRegressor(config_dict='TPOT sparse') tpot_obj._fit_init() assert tpot_obj._config_dict == regressor_config_sparse - - tpot_obj = TPOTRegressor(config_dict='TPOT cuML') - tpot_obj._fit_init() - assert tpot_obj._config_dict == regressor_config_cuml + + if _has_cuml(): + tpot_obj = TPOTClassifier(config_dict='TPOT cuML') + tpot_obj._fit_init() + assert tpot_obj._config_dict == classifier_config_cuml + + tpot_obj = TPOTRegressor(config_dict='TPOT cuML') + tpot_obj._fit_init() + assert tpot_obj._config_dict == regressor_config_cuml def test_conf_dict_2(): From 10217b8e043df40a2a506e05277133ff9eb6f7d1 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 14:43:38 +0000 Subject: [PATCH 17/39] try hit the regressor config codepath in the tests for coverage --- tests/tpot_tests.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index e4a92bde..5ca99204 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -1118,7 +1118,16 @@ def test_fit_7(): def test_fit_cuml(): """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT cuML' if cuML is available. 
If not available, assert _fit_init raises a ValueError.""" - tpot_obj = TPOTClassifier( + tpot_clf_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT cuML' + ) + + tpot_regr_obj = TPOTRegressor( random_state=42, population_size=1, offspring_size=2, @@ -1128,11 +1137,16 @@ def test_fit_cuml(): ) if _has_cuml(): - tpot_obj.fit(training_features, training_target) - assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) - assert not (tpot_obj._start_datetime is None) + tpot_clf_obj.fit(training_features, training_target) + assert isinstance(tpot_clf_obj._optimized_pipeline, creator.Individual) + assert not (tpot_clf_obj._start_datetime is None) + + tpot_regr_obj.fit(pretest_X_reg, pretest_y_reg) + assert isinstance(tpot_regr_obj._optimized_pipeline, creator.Individual) + assert not (tpot_regr_obj._start_datetime is None) else: - assert_raises(ValueError, tpot_obj._fit_init) + assert_raises(ValueError, tpot_clf_obj._fit_init) + assert_raises(ValueError, tpot_regr_obj._fit_init) def test_memory(): From 5cea7b29f7093613972308d4c56f47bf60e80410 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 18:55:35 +0000 Subject: [PATCH 18/39] cleanup configs --- tpot/config/classifier_cuml.py | 19 +++++++++++-------- tpot/config/regressor_cuml.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/tpot/config/classifier_cuml.py b/tpot/config/classifier_cuml.py index 3ab0f6e5..bf1efb16 100644 --- a/tpot/config/classifier_cuml.py +++ b/tpot/config/classifier_cuml.py @@ -31,16 +31,18 @@ classifier_config_cuml = { # cuML Classifiers - + 'cuml.neighbors.KNeighborsClassifier': { 'n_neighbors': range(1, 101), 'weights': ["uniform",], }, + 'cuml.svm.SVC': { 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1,], 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,] }, - "cuml.ensemble.RandomForestClassifier": { + + 'cuml.ensemble.RandomForestClassifier': { 'n_estimators': [100, 300, 500,], 'split_algo': [0, 1,], 'max_depth': range(8, 20), @@ -48,13 +50,14 @@ 'min_rows_per_node': range(2, 21), 'n_bins': [8, 64,] }, + 'cuml.linear_model.LogisticRegression': { 'penalty': ["l1", "l2", "elasticnet"], 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,], }, - - # Sklearn Preprocesssors - + + # Sklearn + cuML Preprocesssors + 'sklearn.preprocessing.Binarizer': { 'threshold': np.arange(0.0, 1.01, 0.05) }, @@ -84,9 +87,9 @@ 'n_components': range(1, 11) }, - 'sklearn.decomposition.PCA': { - 'svd_solver': ['randomized'], - 'iterated_power': range(1, 11) + 'cuml.decomposition.PCA': { + 'svd_solver': ['jacobi'], + 'iterated_power': range(1, 11), }, 'sklearn.preprocessing.PolynomialFeatures': { diff --git a/tpot/config/regressor_cuml.py b/tpot/config/regressor_cuml.py index cd9d4872..67bedb34 100644 --- a/tpot/config/regressor_cuml.py +++ b/tpot/config/regressor_cuml.py @@ -36,7 +36,7 @@ 'l1_ratio': np.arange(0.0, 1.01, 0.05), 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] }, - + 'cuml.neighbors.KNeighborsRegressor': { 'n_neighbors': range(1, 101), 'weights': ["uniform"], @@ -52,17 +52,18 @@ }, 'cuml.ensemble.RandomForestRegressor': { - 'n_estimators': [100], + 'n_estimators': [100, 300, 500,], + 'split_algo': [0, 1,], + 'max_depth': range(8, 20), 'max_features': np.arange(0.05, 1.01, 0.05), - 'min_samples_split': range(2, 21), - 'min_samples_leaf': range(1, 21), - 'bootstrap': [True, False] + 'min_rows_per_node': range(2, 21), + 'n_bins': [8, 64,] }, 'cuml.linear_model.Ridge': { }, - # 
Sklearn Preprocesssors + # Sklearn + cuML Preprocesssors 'sklearn.preprocessing.Binarizer': { 'threshold': np.arange(0.0, 1.01, 0.05) }, @@ -92,9 +93,9 @@ 'n_components': range(1, 11) }, - 'sklearn.decomposition.PCA': { - 'svd_solver': ['randomized'], - 'iterated_power': range(1, 11) + 'cuml.decomposition.PCA': { + 'svd_solver': ['jacobi'], + 'iterated_power': range(1, 11), }, 'sklearn.preprocessing.PolynomialFeatures': { From df00d7dfe87b3fa5c91dad4e1119732173a79a43 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 20 Aug 2020 18:57:34 +0000 Subject: [PATCH 19/39] cuML + TPOT example notebooks (regression and classificaton) --- tutorials/cuML_Classification_Example.ipynb | 182 ++++++++++++++++++++ tutorials/cuML_Regression_Example.ipynb | 170 ++++++++++++++++++ 2 files changed, 352 insertions(+) create mode 100644 tutorials/cuML_Classification_Example.ipynb create mode 100644 tutorials/cuML_Regression_Example.ipynb diff --git a/tutorials/cuML_Classification_Example.ipynb b/tutorials/cuML_Classification_Example.ipynb new file mode 100644 index 00000000..02072e32 --- /dev/null +++ b/tutorials/cuML_Classification_Example.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook walks through a basic example of using [RAPIDS](https://rapids.ai/) cuML estimators with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from tpot import TPOTClassifier\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "NSAMPLES = 50000\n", + "NFEATURES = 20\n", + "SEED = 12\n", + "\n", + "# For cuML with TPOT, you must use CPU data (such as NumPy arrays)\n", + "X, y = make_classification(\n", + " n_samples=NSAMPLES,\n", + " n_features=NFEATURES,\n", + " n_informative=NFEATURES,\n", + " n_redundant=0,\n", + " class_sep=0.75,\n", + " n_classes=2,\n", + " random_state=SEED,\n", + " \n", + ")\n", + "\n", + "X = X.astype(\"float32\")\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=60.0, style=ProgressStyle(des…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Generation 1 - Current best internal CV score: 0.9822666666666667\n", + "Generation 2 - Current best internal CV score: 0.9822666666666667\n", + "Generation 3 - Current best internal CV score: 0.9822666666666667\n", + "Generation 4 - Current best internal CV score: 0.9840800000000001\n", + "Generation 5 - Current best internal CV score: 0.9840800000000001\n", + "Best pipeline: SVC(OneHotEncoder(FastICA(input_matrix, tol=0.9500000000000001), minimum_fraction=0.15, sparse=False, threshold=10), C=5.0, tol=0.001)\n", + "0.98584\n" + ] + } + ], + "source": [ + "# TPOT setup\n", + "GENERATIONS = 5\n", + "POP_SIZE = 10\n", + "CV = 3\n", + "\n", + "tpot = TPOTClassifier(\n", + " generations=GENERATIONS,\n", + " population_size=POP_SIZE,\n", + " random_state=SEED,\n", + " config_dict=\"TPOT cuML\",\n", + " n_jobs=1, # cuML requires n_jobs=1, the default\n", + " cv=CV,\n", + " verbosity=2,\n", + ")\n", + "\n", + "tpot.fit(X_train, y_train)\n", + "print(tpot.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import numpy as np\n", + "import pandas as pd\n", + "from cuml.svm import SVC\n", + "from sklearn.decomposition import FastICA\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline\n", + "from tpot.builtins import OneHotEncoder\n", + "from tpot.export_utils import set_param_recursive\n", + "\n", + "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", + "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = tpot_data.drop('target', axis=1)\n", + "training_features, testing_features, training_target, testing_target = \\\n", + " train_test_split(features, tpot_data['target'], random_state=12)\n", + "\n", + "# Average CV score on the training set was: 0.9840800000000001\n", + "exported_pipeline = make_pipeline(\n", + " FastICA(tol=0.9500000000000001),\n", + " OneHotEncoder(minimum_fraction=0.15, sparse=False, threshold=10),\n", + " SVC(C=5.0, tol=0.001)\n", + ")\n", + "# Fix random state for all the steps in exported pipeline\n", + "set_param_recursive(exported_pipeline.steps, 'random_state', 12)\n", + "\n", + "exported_pipeline.fit(training_features, training_target)\n", + "results = exported_pipeline.predict(testing_features)\n", + "\n" + ] + } + ], + "source": [ + "tpot.export('tpot_classification_cuml_pipeline.py')\n", + "print(tpot.export())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + 
"nbformat_minor": 4 +} diff --git a/tutorials/cuML_Regression_Example.ipynb b/tutorials/cuML_Regression_Example.ipynb new file mode 100644 index 00000000..c6b49843 --- /dev/null +++ b/tutorials/cuML_Regression_Example.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook walks through a basic example of using [RAPIDS](https://rapids.ai/) cuML estimators with TPOT for regression problems. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from tpot import TPOTRegressor\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "NSAMPLES = 50000\n", + "NFEATURES = 20\n", + "SEED = 12\n", + "\n", + "# For cuML with TPOT, you must use CPU data (such as NumPy arrays)\n", + "X, y = make_regression(\n", + " n_samples=NSAMPLES,\n", + " n_features=NFEATURES,\n", + " n_informative=NFEATURES,\n", + " random_state=SEED,\n", + " noise=200,\n", + ")\n", + "\n", + "X = X.astype(\"float32\")\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=60.0, style=ProgressStyle(des…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Generation 1 - Current best internal CV score: -39782.747480986516\n", + "Generation 2 - Current best internal CV score: -39782.747480986516\n", + "Generation 3 - Current best internal CV score: -39782.74741899271\n", + "Generation 4 - Current best internal CV score: -39782.74728636739\n", + "Generation 5 - Current best internal CV score: -39782.73498449405\n", + "Best pipeline: Ridge(ElasticNet(input_matrix, l1_ratio=0.45, tol=0.1))\n", + "-40365.10253091067\n" + ] + } + ], + "source": [ + "# TPOT setup\n", + "GENERATIONS = 5\n", + "POP_SIZE = 10\n", + "CV = 3\n", + "\n", + "tpot = TPOTRegressor(\n", + " generations=GENERATIONS,\n", + " population_size=POP_SIZE,\n", + " random_state=SEED,\n", + " config_dict=\"TPOT cuML\",\n", + " n_jobs=1, # cuML requires n_jobs=1\n", + " cv=CV,\n", + " verbosity=2,\n", + ")\n", + "\n", + "tpot.fit(X_train, y_train)\n", + "print(tpot.score(X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import numpy as np\n", + "import pandas as pd\n", + "from cuml.linear_model import ElasticNet, Ridge\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.pipeline import make_pipeline, make_union\n", + "from tpot.builtins import StackingEstimator\n", + "from tpot.export_utils import set_param_recursive\n", + "\n", + "# NOTE: Make sure 
that the outcome column is labeled 'target' in the data file\n", + "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n", + "features = tpot_data.drop('target', axis=1)\n", + "training_features, testing_features, training_target, testing_target = \\\n", + " train_test_split(features, tpot_data['target'], random_state=12)\n", + "\n", + "# Average CV score on the training set was: -39782.73498449405\n", + "exported_pipeline = make_pipeline(\n", + " StackingEstimator(estimator=ElasticNet(l1_ratio=0.45, tol=0.1)),\n", + " Ridge()\n", + ")\n", + "# Fix random state for all the steps in exported pipeline\n", + "set_param_recursive(exported_pipeline.steps, 'random_state', 12)\n", + "\n", + "exported_pipeline.fit(training_features, training_target)\n", + "results = exported_pipeline.predict(testing_features)\n", + "\n" + ] + } + ], + "source": [ + "tpot.export('tpot_regression_cuml_pipeline.py')\n", + "print(tpot.export())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 7633284383f044ac03904fd5e9e228a5cad85c0b Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 21 Aug 2020 17:24:33 +0000 Subject: [PATCH 20/39] cut off svr tolerance minimum at 1e-4 --- tpot/config/regressor_cuml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/config/regressor_cuml.py b/tpot/config/regressor_cuml.py index 67bedb34..42094c32 100644 --- a/tpot/config/regressor_cuml.py +++ b/tpot/config/regressor_cuml.py @@ -47,7 +47,7 @@ }, 'cuml.svm.SVR': { - 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1,], + 'tol': [1e-4, 1e-3, 1e-2, 1e-1,], 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,] }, From fd71b8b9fb09d696dd5ebf059e3f2c699c403310 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Fri, 21 Aug 2020 18:36:46 -0400 Subject: [PATCH 21/39] update using.md for the cuML config --- docs_sources/using.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs_sources/using.md b/docs_sources/using.md index e2687a90..a4e1bcf2 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -457,6 +457,16 @@ Currently only classification is supported, but future releases will include reg Classification + +TPOT-cuML +TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. +

+This configuration is useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor. +Classification +

+Regression + + To use any of these configurations, simply pass the string name of the configuration to the `config_dict` parameter (or `-config` on the command line). For example, to use the "TPOT light" configuration: From eb3909bf3767ceb38119bc93f055650b60cced23 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 26 Aug 2020 15:05:29 +0000 Subject: [PATCH 22/39] update cuml config and add larger-scale example notebook on real data to highlight the impact --- tpot/config/classifier_cuml.py | 121 +++++++------ tutorials/Higgs_Boson.ipynb | 307 +++++++++++++++++++++++++++++++++ 2 files changed, 377 insertions(+), 51 deletions(-) create mode 100644 tutorials/Higgs_Boson.ipynb diff --git a/tpot/config/classifier_cuml.py b/tpot/config/classifier_cuml.py index bf1efb16..69e8b12e 100644 --- a/tpot/config/classifier_cuml.py +++ b/tpot/config/classifier_cuml.py @@ -26,94 +26,113 @@ import numpy as np # This configuration provides users with access to a GPU the ability to -# use cuML classifiers as estimators alongside the scikit-learn -# preprocessors in the TPOT default configuration. +# use RAPIDS cuML and DMLC/XGBoost classifiers as estimators alongside +# the scikit-learn preprocessors in the TPOT default configuration. classifier_config_cuml = { - # cuML Classifiers + # cuML + XGboost Classifiers - 'cuml.neighbors.KNeighborsClassifier': { - 'n_neighbors': range(1, 101), - 'weights': ["uniform",], + "cuml.neighbors.KNeighborsClassifier": { + "n_neighbors": range(1, 101), + "weights": ["uniform",], }, - 'cuml.svm.SVC': { - 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1,], - 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,] + "cuml.ensemble.RandomForestClassifier": { + "n_estimators": [100, 300], + "split_algo": [0, 1], + "max_depth": range(8, 16), + "max_features": np.arange(0.05, 1.01, 0.05), + "min_rows_per_node": range(2, 21), + "n_bins": [64,] }, - 'cuml.ensemble.RandomForestClassifier': { - 'n_estimators': [100, 300, 500,], - 'split_algo': [0, 1,], - 'max_depth': range(8, 20), - 'max_features': np.arange(0.05, 1.01, 0.05), - 'min_rows_per_node': range(2, 21), - 'n_bins': [8, 64,] + "cuml.linear_model.LogisticRegression": { + "penalty": ["l1", "l2", "elasticnet"], + "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,], }, - 'cuml.linear_model.LogisticRegression': { - 'penalty': ["l1", "l2", "elasticnet"], - 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,], + "xgboost.XGBClassifier": { + "n_estimators": [100], + "max_depth": range(3, 10), + "learning_rate": [1e-2, 1e-1, 0.5, 1.], + "subsample": np.arange(0.05, 1.01, 0.05), + "min_child_weight": range(1, 21), + "alpha": [1, 10], + "tree_method": ["gpu_hist"], + "nthread": [1] }, - # Sklearn + cuML Preprocesssors + # Sklearn Preprocesssors - 'sklearn.preprocessing.Binarizer': { - 'threshold': np.arange(0.0, 1.01, 0.05) + "sklearn.preprocessing.Binarizer": { + "threshold": np.arange(0.0, 1.01, 0.05) }, - 'sklearn.decomposition.FastICA': { - 'tol': np.arange(0.0, 1.01, 0.05) + "sklearn.decomposition.FastICA": { + "tol": np.arange(0.0, 1.01, 0.05) }, - 'sklearn.cluster.FeatureAgglomeration': { - 'linkage': ['ward', 'complete', 'average'], - 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'] + "sklearn.cluster.FeatureAgglomeration": { + "linkage": ["ward", "complete", "average"], + "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine"] }, - 'sklearn.preprocessing.MaxAbsScaler': { + "sklearn.preprocessing.MaxAbsScaler": { }, - 'sklearn.preprocessing.MinMaxScaler': { + 
"sklearn.preprocessing.MinMaxScaler": { }, - 'sklearn.preprocessing.Normalizer': { - 'norm': ['l1', 'l2', 'max'] + "sklearn.preprocessing.Normalizer": { + "norm": ["l1", "l2", "max"] }, - 'sklearn.kernel_approximation.Nystroem': { - 'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'], - 'gamma': np.arange(0.0, 1.01, 0.05), - 'n_components': range(1, 11) + "sklearn.kernel_approximation.Nystroem": { + "kernel": ["rbf", "cosine", "chi2", "laplacian", "polynomial", "poly", "linear", "additive_chi2", "sigmoid"], + "gamma": np.arange(0.0, 1.01, 0.05), + "n_components": range(1, 11) }, - 'cuml.decomposition.PCA': { - 'svd_solver': ['jacobi'], - 'iterated_power': range(1, 11), + "sklearn.decomposition.PCA": { + "svd_solver": ["randomized"], + "iterated_power": range(1, 11) }, - 'sklearn.preprocessing.PolynomialFeatures': { - 'degree': [2], - 'include_bias': [False], - 'interaction_only': [False] + "sklearn.kernel_approximation.RBFSampler": { + "gamma": np.arange(0.0, 1.01, 0.05) }, - 'sklearn.kernel_approximation.RBFSampler': { - 'gamma': np.arange(0.0, 1.01, 0.05) + "sklearn.preprocessing.RobustScaler": { }, - 'sklearn.preprocessing.RobustScaler': { + "sklearn.preprocessing.StandardScaler": { }, - 'sklearn.preprocessing.StandardScaler': { + "tpot.builtins.ZeroCount": { }, - 'tpot.builtins.ZeroCount': { + "tpot.builtins.OneHotEncoder": { + "minimum_fraction": [0.05, 0.1, 0.15, 0.2, 0.25], + "sparse": [False], + "threshold": [10] }, - 'tpot.builtins.OneHotEncoder': { - 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], - 'sparse': [False], - 'threshold': [10] + # Selectors + "sklearn.feature_selection.SelectFwe": { + "alpha": np.arange(0, 0.05, 0.001), + "score_func": { + "sklearn.feature_selection.f_classif": None + } }, + + "sklearn.feature_selection.SelectPercentile": { + "percentile": range(1, 100), + "score_func": { + "sklearn.feature_selection.f_classif": None + } + }, + + "sklearn.feature_selection.VarianceThreshold": { + "threshold": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2] + } } diff --git a/tutorials/Higgs_Boson.ipynb b/tutorials/Higgs_Boson.ipynb new file mode 100644 index 00000000..ad7f7aec --- /dev/null +++ b/tutorials/Higgs_Boson.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML.\n", + "\n", + "It is intended to show how the `TPOT cuML` configuration can provide significant benefits on medium-sized and larger datasets. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downloading Data\n", + "\n", + "This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UCI Machine Learning Repositoru." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from tpot import TPOTClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# This is a 2.7 GB file.\n", + "# Please make sure you have space before uncommenting the code below and downloading this file.\n", + "\n", + "if not os.path.isfile(\"HIGGS.csv.gz\"):\n", + " !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# This fuction is borrowed from https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n", + "# Thanks!\n", + "\n", + "def prepare_higgs(dataset_folder, nrows=None):\n", + " higgs = pd.read_csv(\"HIGGS.csv.gz\", nrows=nrows)\n", + " X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32)\n", + " y = higgs.iloc[:, 0].to_numpy(dtype=np.int64)\n", + " return train_test_split(X, y, stratify=y, random_state=77, test_size=0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running TPOTClassifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the interest of time, we'll only use a 500,000 row sample of this file. 500,000 rows is more than enough for this example.\n", + "\n", + "With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup.\n", + "\n", + "Such speedups also mean you can create larger evolutionary search strategies while **still** returning faster results." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "NROWS = 500_000\n", + "X_train, X_test, y_train, y_test = prepare_higgs(\"./\", nrows=NROWS)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that for cuML to work correctly, you must set `n_jobs=1` (the default setting)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=110.0, style=ProgressStyle(de…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Generation 1 - Current best internal CV score: 0.7103025000000001\n", + "Generation 2 - Current best internal CV score: 0.71385\n", + "Generation 3 - Current best internal CV score: 0.725755\n", + "Generation 4 - Current best internal CV score: 0.7299725\n", + "Generation 5 - Current best internal CV score: 0.7299725\n", + "Generation 6 - Current best internal CV score: 0.7299725\n", + "Generation 7 - Current best internal CV score: 0.7309975\n", + "Generation 8 - Current best internal CV score: 0.7309975\n", + "Generation 9 - Current best internal CV score: 0.7309975\n", + "Generation 10 - Current best internal CV score: 0.7309975\n", + "Best pipeline: XGBClassifier(ZeroCount(input_matrix), alpha=1, learning_rate=0.1, max_depth=6, min_child_weight=13, n_estimators=100, nthread=1, subsample=0.8500000000000001, tree_method=gpu_hist)\n", + "CPU times: user 4min 59s, sys: 13min 27s, total: 18min 27s\n", + "Wall time: 18min 29s\n" + ] + }, + { + "data": { + "text/plain": [ + "TPOTClassifier(config_dict='TPOT cuML', cv=2, generations=10,\n", + " log_file=,\n", + " population_size=10, random_state=12, verbosity=2)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# cuML TPOT setup\n", + "SEED = 12\n", + "GENERATIONS = 10\n", + "POP_SIZE = 10\n", + "CV = 2\n", + "\n", + "tpot = TPOTClassifier(\n", + " generations=GENERATIONS,\n", + " population_size=POP_SIZE,\n", + " random_state=SEED,\n", + " config_dict=\"TPOT cuML\",\n", + " n_jobs=1, # cuML requires n_jobs=1, the default\n", + " cv=CV,\n", + " verbosity=2,\n", + ")\n", + "\n", + "tpot.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7308499813079834\n", + "CPU times: user 565 ms, sys: 5.52 ms, total: 570 ms\n", + "Wall time: 569 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "preds = tpot.predict(X_test)\n", + "print(accuracy_score(y_test, preds))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=110.0, style=ProgressStyle(de…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Generation 1 - Current best internal CV score: 0.7184675\n", + "Generation 2 - Current best internal CV score: 0.7184675\n", + "Generation 3 - Current best internal CV score: 0.7198\n", + "Generation 4 - Current best internal CV score: 0.7210825000000001\n", + "Generation 5 - Current best internal CV score: 0.7222999999999999\n", + "Generation 6 - Current best internal CV score: 0.7222999999999999\n", + "Generation 7 - Current best internal CV score: 0.7270125000000001\n", + "Generation 8 - Current 
best internal CV score: 0.73546\n", + "Generation 9 - Current best internal CV score: 0.73546\n", + "Generation 10 - Current best internal CV score: 0.735545\n", + "Best pipeline: XGBClassifier(OneHotEncoder(input_matrix, minimum_fraction=0.2, sparse=False, threshold=10), learning_rate=0.1, max_depth=9, min_child_weight=19, n_estimators=100, nthread=1, subsample=1.0)\n", + "CPU times: user 10min, sys: 1min 8s, total: 11min 9s\n", + "Wall time: 5h 17min 28s\n" + ] + }, + { + "data": { + "text/plain": [ + "TPOTClassifier(cv=2, generations=10,\n", + " log_file=,\n", + " n_jobs=-1, population_size=10, random_state=12, verbosity=2)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# Default TPOT setup with same params\n", + "tpot = TPOTClassifier(\n", + " generations=GENERATIONS,\n", + " population_size=POP_SIZE,\n", + " random_state=SEED,\n", + " n_jobs=-1,\n", + " cv=CV,\n", + " verbosity=2,\n", + ")\n", + "\n", + "tpot.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7378900051116943\n", + "CPU times: user 968 ms, sys: 0 ns, total: 968 ms\n", + "Wall time: 967 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "preds = tpot.predict(X_test)\n", + "print(accuracy_score(y_test, preds))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 29ee28f40ecfd1b6dd27c80b9a1e824b11bbbc19 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 26 Aug 2020 20:57:06 +0000 Subject: [PATCH 23/39] clean up configs; update notebooks --- tpot/config/classifier_cuml.py | 3 +- tpot/config/regressor_cuml.py | 130 ++++++++++++-------- tutorials/Higgs_Boson.ipynb | 73 ++++++----- tutorials/cuML_Classification_Example.ipynb | 63 ++++------ tutorials/cuML_Regression_Example.ipynb | 41 +++--- 5 files changed, 166 insertions(+), 144 deletions(-) diff --git a/tpot/config/classifier_cuml.py b/tpot/config/classifier_cuml.py index 69e8b12e..010a058f 100644 --- a/tpot/config/classifier_cuml.py +++ b/tpot/config/classifier_cuml.py @@ -30,7 +30,7 @@ # the scikit-learn preprocessors in the TPOT default configuration. classifier_config_cuml = { - # cuML + XGboost Classifiers + # cuML + DMLC/XGBoost Classifiers "cuml.neighbors.KNeighborsClassifier": { "n_neighbors": range(1, 101), @@ -118,6 +118,7 @@ }, # Selectors + "sklearn.feature_selection.SelectFwe": { "alpha": np.arange(0, 0.05, 0.001), "score_func": { diff --git a/tpot/config/regressor_cuml.py b/tpot/config/regressor_cuml.py index 42094c32..3d61720c 100644 --- a/tpot/config/regressor_cuml.py +++ b/tpot/config/regressor_cuml.py @@ -26,100 +26,122 @@ import numpy as np # This configuration provides users with access to a GPU the ability to -# use cuML regressors as estimators alongside the scikit-learn -# preprocessors in the TPOT default configuration. +# use RAPIDS cuML and DMLC/XGBoost regressors as estimators alongside +# the scikit-learn preprocessors in the TPOT default configuration. 
regressor_config_cuml = { - # cuML Regressors + # cuML + DMLC/XGBoost Regressors - 'cuml.linear_model.ElasticNet': { - 'l1_ratio': np.arange(0.0, 1.01, 0.05), - 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + "cuml.linear_model.ElasticNet": { + "l1_ratio": np.arange(0.0, 1.01, 0.05), + "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] }, - 'cuml.neighbors.KNeighborsRegressor': { - 'n_neighbors': range(1, 101), - 'weights': ["uniform"], + "cuml.neighbors.KNeighborsRegressor": { + "n_neighbors": range(1, 101), + "weights": ["uniform"], }, - 'cuml.linear_model.Lasso': { - 'normalize': [True, False] + "cuml.linear_model.Lasso": { + "normalize": [True, False] }, - 'cuml.svm.SVR': { - 'tol': [1e-4, 1e-3, 1e-2, 1e-1,], - 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,] + "cuml.ensemble.RandomForestRegressor": { + "n_estimators": [100, 300, 500,], + "split_algo": [0, 1,], + "max_depth": range(8, 20), + "max_features": np.arange(0.05, 1.01, 0.05), + "min_rows_per_node": range(2, 21), + "n_bins": [64,] }, - 'cuml.ensemble.RandomForestRegressor': { - 'n_estimators': [100, 300, 500,], - 'split_algo': [0, 1,], - 'max_depth': range(8, 20), - 'max_features': np.arange(0.05, 1.01, 0.05), - 'min_rows_per_node': range(2, 21), - 'n_bins': [8, 64,] + "cuml.linear_model.Ridge": { }, - 'cuml.linear_model.Ridge': { + "xgboost.XGBRegressor": { + "n_estimators": [100], + "max_depth": range(3, 10), + "learning_rate": [1e-2, 1e-1, 0.5, 1.], + "subsample": np.arange(0.05, 1.01, 0.05), + "min_child_weight": range(1, 21), + "alpha": [1, 10], + "tree_method": ["gpu_hist"], + "nthread": [1], + "objective": ["reg:squarederror"] }, - # Sklearn + cuML Preprocesssors - 'sklearn.preprocessing.Binarizer': { - 'threshold': np.arange(0.0, 1.01, 0.05) + # Sklearn Preprocesssors + + "sklearn.preprocessing.Binarizer": { + "threshold": np.arange(0.0, 1.01, 0.05) }, - 'sklearn.decomposition.FastICA': { - 'tol': np.arange(0.0, 1.01, 0.05) + "sklearn.decomposition.FastICA": { + "tol": np.arange(0.0, 1.01, 0.05) }, - 'sklearn.cluster.FeatureAgglomeration': { - 'linkage': ['ward', 'complete', 'average'], - 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'] + "sklearn.cluster.FeatureAgglomeration": { + "linkage": ["ward", "complete", "average"], + "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine"] }, - 'sklearn.preprocessing.MaxAbsScaler': { + "sklearn.preprocessing.MaxAbsScaler": { }, - 'sklearn.preprocessing.MinMaxScaler': { + "sklearn.preprocessing.MinMaxScaler": { }, - 'sklearn.preprocessing.Normalizer': { - 'norm': ['l1', 'l2', 'max'] + "sklearn.preprocessing.Normalizer": { + "norm": ["l1", "l2", "max"] }, - 'sklearn.kernel_approximation.Nystroem': { - 'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'], - 'gamma': np.arange(0.0, 1.01, 0.05), - 'n_components': range(1, 11) + "sklearn.kernel_approximation.Nystroem": { + "kernel": ["rbf", "cosine", "chi2", "laplacian", "polynomial", "poly", "linear", "additive_chi2", "sigmoid"], + "gamma": np.arange(0.0, 1.01, 0.05), + "n_components": range(1, 11) }, - 'cuml.decomposition.PCA': { - 'svd_solver': ['jacobi'], - 'iterated_power': range(1, 11), + "sklearn.decomposition.PCA": { + "svd_solver": ["randomized"], + "iterated_power": range(1, 11) }, - 'sklearn.preprocessing.PolynomialFeatures': { - 'degree': [2], - 'include_bias': [False], - 'interaction_only': [False] + "sklearn.kernel_approximation.RBFSampler": { + "gamma": np.arange(0.0, 1.01, 0.05) }, - 'sklearn.kernel_approximation.RBFSampler': { - 'gamma': 
np.arange(0.0, 1.01, 0.05) + "sklearn.preprocessing.RobustScaler": { }, - 'sklearn.preprocessing.RobustScaler': { + "sklearn.preprocessing.StandardScaler": { }, - 'sklearn.preprocessing.StandardScaler': { + "tpot.builtins.ZeroCount": { }, - 'tpot.builtins.ZeroCount': { + "tpot.builtins.OneHotEncoder": { + "minimum_fraction": [0.05, 0.1, 0.15, 0.2, 0.25], + "sparse": [False], + "threshold": [10] }, - 'tpot.builtins.OneHotEncoder': { - 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], - 'sparse': [False], - 'threshold': [10] + # Selectors + + "sklearn.feature_selection.SelectFwe": { + "alpha": np.arange(0, 0.05, 0.001), + "score_func": { + "sklearn.feature_selection.f_classif": None + } }, + + "sklearn.feature_selection.SelectPercentile": { + "percentile": range(1, 100), + "score_func": { + "sklearn.feature_selection.f_classif": None + } + }, + + "sklearn.feature_selection.VarianceThreshold": { + "threshold": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2] + } } diff --git a/tutorials/Higgs_Boson.ipynb b/tutorials/Higgs_Boson.ipynb index ad7f7aec..e0e4122a 100644 --- a/tutorials/Higgs_Boson.ipynb +++ b/tutorials/Higgs_Boson.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML.\n", + "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML.\n", "\n", "It is intended to show how the `TPOT cuML` configuration can provide significant benefits on medium-sized and larger datasets. " ] @@ -15,15 +15,17 @@ "source": [ "## Downloading Data\n", "\n", - "This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UCI Machine Learning Repositoru." + "This example uses the Higgs Boson [dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) from the UC Irvine Machine Learning Repository." 
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ + "import os\n", + "\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", @@ -34,28 +36,33 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# This is a 2.7 GB file.\n", - "# Please make sure you have space before uncommenting the code below and downloading this file.\n", + "# Please make sure you have enough space available before\n", + "# uncommenting the code below and downloading this file.\n", "\n", - "if not os.path.isfile(\"HIGGS.csv.gz\"):\n", - " !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" + "DATA_DIRECTORY = \"./\"\n", + "DATASET_PATH = os.path.join(DATA_DIRECTORY, \"HIGGS.csv.gz\")\n", + "\n", + "# if not os.path.isfile(DATASET_PATH):\n", + "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# This fuction is borrowed from https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n", + "# This fuction is borrowed and adapted from\n", + "# https://github.com/NVIDIA/gbm-bench/blob/master/datasets.py\n", "# Thanks!\n", "\n", - "def prepare_higgs(dataset_folder, nrows=None):\n", - " higgs = pd.read_csv(\"HIGGS.csv.gz\", nrows=nrows)\n", + "def prepare_higgs(nrows=None):\n", + " higgs = pd.read_csv(DATASET_PATH, nrows=nrows)\n", " X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32)\n", " y = higgs.iloc[:, 0].to_numpy(dtype=np.int64)\n", " return train_test_split(X, y, stratify=y, random_state=77, test_size=0.2)" @@ -74,19 +81,19 @@ "source": [ "In the interest of time, we'll only use a 500,000 row sample of this file. 500,000 rows is more than enough for this example.\n", "\n", - "With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup.\n", + "With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup while achieving essentially equivalent accuracy.\n", "\n", - "Such speedups also mean you can create larger evolutionary search strategies while **still** returning faster results." + "Such speedups also mean you can create larger evolutionary search strategies while **still** obtaining faster results." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "NROWS = 500_000\n", - "X_train, X_test, y_train, y_test = prepare_higgs(\"./\", nrows=NROWS)" + "X_train, X_test, y_train, y_test = prepare_higgs(nrows=NROWS)" ] }, { @@ -98,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -121,29 +128,29 @@ "text": [ "\n", "Generation 1 - Current best internal CV score: 0.7103025000000001\n", - "Generation 2 - Current best internal CV score: 0.71385\n", + "Generation 2 - Current best internal CV score: 0.7103025000000001\n", "Generation 3 - Current best internal CV score: 0.725755\n", - "Generation 4 - Current best internal CV score: 0.7299725\n", - "Generation 5 - Current best internal CV score: 0.7299725\n", - "Generation 6 - Current best internal CV score: 0.7299725\n", - "Generation 7 - Current best internal CV score: 0.7309975\n", - "Generation 8 - Current best internal CV score: 0.7309975\n", - "Generation 9 - Current best internal CV score: 0.7309975\n", - "Generation 10 - Current best internal CV score: 0.7309975\n", - "Best pipeline: XGBClassifier(ZeroCount(input_matrix), alpha=1, learning_rate=0.1, max_depth=6, min_child_weight=13, n_estimators=100, nthread=1, subsample=0.8500000000000001, tree_method=gpu_hist)\n", - "CPU times: user 4min 59s, sys: 13min 27s, total: 18min 27s\n", - "Wall time: 18min 29s\n" + "Generation 4 - Current best internal CV score: 0.727995\n", + "Generation 5 - Current best internal CV score: 0.727995\n", + "Generation 6 - Current best internal CV score: 0.730315\n", + "Generation 7 - Current best internal CV score: 0.730315\n", + "Generation 8 - Current best internal CV score: 0.730315\n", + "Generation 9 - Current best internal CV score: 0.7308699999999999\n", + "Generation 10 - Current best internal CV score: 0.7347775\n", + "Best pipeline: XGBClassifier(input_matrix, alpha=1, learning_rate=0.1, max_depth=8, min_child_weight=19, n_estimators=100, nthread=1, subsample=0.8, tree_method=gpu_hist)\n", + "CPU times: user 5min 34s, sys: 1min 16s, total: 6min 50s\n", + "Wall time: 6min 52s\n" ] }, { "data": { "text/plain": [ "TPOTClassifier(config_dict='TPOT cuML', cv=2, generations=10,\n", - " log_file=,\n", + " log_file=,\n", " population_size=10, random_state=12, verbosity=2)" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -172,16 +179,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.7308499813079834\n", - "CPU times: user 565 ms, sys: 5.52 ms, total: 570 ms\n", - "Wall time: 569 ms\n" + "0.73669\n", + "CPU times: user 816 ms, sys: 39.9 ms, total: 856 ms\n", + "Wall time: 855 ms\n" ] } ], diff --git a/tutorials/cuML_Classification_Example.ipynb b/tutorials/cuML_Classification_Example.ipynb index 02072e32..23761120 100644 --- a/tutorials/cuML_Classification_Example.ipynb +++ b/tutorials/cuML_Classification_Example.ipynb @@ -4,23 +4,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook walks through a basic example of using [RAPIDS](https://rapids.ai/) cuML estimators with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML." 
+ "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from tpot import TPOTClassifier\n", "from sklearn.datasets import make_classification\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +35,7 @@ " n_features=NFEATURES,\n", " n_informative=NFEATURES,\n", " n_redundant=0,\n", - " class_sep=0.75,\n", + " class_sep=0.55,\n", " n_classes=2,\n", " random_state=SEED,\n", " \n", @@ -54,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -65,7 +66,7 @@ "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=60.0, style=ProgressStyle(des…" + "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=30.0, style=ProgressStyle(des…" ] }, "metadata": {}, @@ -76,21 +77,21 @@ "output_type": "stream", "text": [ "\n", - "Generation 1 - Current best internal CV score: 0.9822666666666667\n", - "Generation 2 - Current best internal CV score: 0.9822666666666667\n", - "Generation 3 - Current best internal CV score: 0.9822666666666667\n", - "Generation 4 - Current best internal CV score: 0.9840800000000001\n", - "Generation 5 - Current best internal CV score: 0.9840800000000001\n", - "Best pipeline: SVC(OneHotEncoder(FastICA(input_matrix, tol=0.9500000000000001), minimum_fraction=0.15, sparse=False, threshold=10), C=5.0, tol=0.001)\n", - "0.98584\n" + "Generation 1 - Current best internal CV score: 0.9695733333333334\n", + "Generation 2 - Current best internal CV score: 0.9695733333333334\n", + "Generation 3 - Current best internal CV score: 0.9695733333333334\n", + "Generation 4 - Current best internal CV score: 0.9705333333333334\n", + "Generation 5 - Current best internal CV score: 0.9705333333333334\n", + "Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=20, weights=uniform)\n", + "0.97704\n" ] } ], "source": [ "# TPOT setup\n", "GENERATIONS = 5\n", - "POP_SIZE = 10\n", - "CV = 3\n", + "POP_SIZE = 5\n", + "CV = 2\n", "\n", "tpot = TPOTClassifier(\n", " generations=GENERATIONS,\n", @@ -103,7 +104,9 @@ ")\n", "\n", "tpot.fit(X_train, y_train)\n", - "print(tpot.score(X_test, y_test))" + "\n", + "preds = tpot.predict(X_test)\n", + "print(accuracy_score(y_test, preds))" ] }, { @@ -117,12 +120,8 @@ "text": [ "import numpy as np\n", "import pandas as pd\n", - "from cuml.svm import SVC\n", - "from sklearn.decomposition import FastICA\n", + "from cuml.neighbors import KNeighborsClassifier\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import make_pipeline\n", - "from tpot.builtins import OneHotEncoder\n", - "from tpot.export_utils import set_param_recursive\n", "\n", "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', 
sep='COLUMN_SEPARATOR', dtype=np.float64)\n", @@ -130,14 +129,11 @@ "training_features, testing_features, training_target, testing_target = \\\n", " train_test_split(features, tpot_data['target'], random_state=12)\n", "\n", - "# Average CV score on the training set was: 0.9840800000000001\n", - "exported_pipeline = make_pipeline(\n", - " FastICA(tol=0.9500000000000001),\n", - " OneHotEncoder(minimum_fraction=0.15, sparse=False, threshold=10),\n", - " SVC(C=5.0, tol=0.001)\n", - ")\n", - "# Fix random state for all the steps in exported pipeline\n", - "set_param_recursive(exported_pipeline.steps, 'random_state', 12)\n", + "# Average CV score on the training set was: 0.9705333333333334\n", + "exported_pipeline = KNeighborsClassifier(n_neighbors=20, weights=\"uniform\")\n", + "# Fix random state in exported estimator\n", + "if hasattr(exported_pipeline, 'random_state'):\n", + " setattr(exported_pipeline, 'random_state', 12)\n", "\n", "exported_pipeline.fit(training_features, training_target)\n", "results = exported_pipeline.predict(testing_features)\n", @@ -149,13 +145,6 @@ "tpot.export('tpot_classification_cuml_pipeline.py')\n", "print(tpot.export())" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/cuML_Regression_Example.ipynb b/tutorials/cuML_Regression_Example.ipynb index c6b49843..a28bfa2c 100644 --- a/tutorials/cuML_Regression_Example.ipynb +++ b/tutorials/cuML_Regression_Example.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook walks through a basic example of using [RAPIDS](https://rapids.ai/) cuML estimators with TPOT for regression problems. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML, will cause TPOT to raise a `ValueError` indicating you should install cuML." + "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML." 
] }, { @@ -15,7 +15,8 @@ "source": [ "from tpot import TPOTRegressor\n", "from sklearn.datasets import make_regression\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import r2_score" ] }, { @@ -25,7 +26,7 @@ "outputs": [], "source": [ "NSAMPLES = 50000\n", - "NFEATURES = 20\n", + "NFEATURES = 50\n", "SEED = 12\n", "\n", "# For cuML with TPOT, you must use CPU data (such as NumPy arrays)\n", @@ -62,7 +63,7 @@ "version_minor": 0 }, "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=60.0, style=ProgressStyle(des…" + "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=30.0, style=ProgressStyle(des…" ] }, "metadata": {}, @@ -73,21 +74,21 @@ "output_type": "stream", "text": [ "\n", - "Generation 1 - Current best internal CV score: -39782.747480986516\n", - "Generation 2 - Current best internal CV score: -39782.747480986516\n", - "Generation 3 - Current best internal CV score: -39782.74741899271\n", - "Generation 4 - Current best internal CV score: -39782.74728636739\n", - "Generation 5 - Current best internal CV score: -39782.73498449405\n", - "Best pipeline: Ridge(ElasticNet(input_matrix, l1_ratio=0.45, tol=0.1))\n", - "-40365.10253091067\n" + "Generation 1 - Current best internal CV score: -40245.878012401336\n", + "Generation 2 - Current best internal CV score: -40245.878012401336\n", + "Generation 3 - Current best internal CV score: -40245.878012401336\n", + "Generation 4 - Current best internal CV score: -40245.87130877891\n", + "Generation 5 - Current best internal CV score: -40245.87130877891\n", + "Best pipeline: Ridge(RobustScaler(input_matrix))\n", + "0.8281615479382644\n" ] } ], "source": [ "# TPOT setup\n", "GENERATIONS = 5\n", - "POP_SIZE = 10\n", - "CV = 3\n", + "POP_SIZE = 5\n", + "CV = 2\n", "\n", "tpot = TPOTRegressor(\n", " generations=GENERATIONS,\n", @@ -100,7 +101,9 @@ ")\n", "\n", "tpot.fit(X_train, y_train)\n", - "print(tpot.score(X_test, y_test))" + "\n", + "preds = tpot.predict(X_test)\n", + "print(r2_score(y_test, preds))" ] }, { @@ -114,10 +117,10 @@ "text": [ "import numpy as np\n", "import pandas as pd\n", - "from cuml.linear_model import ElasticNet, Ridge\n", + "from cuml.linear_model import Ridge\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import make_pipeline, make_union\n", - "from tpot.builtins import StackingEstimator\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import RobustScaler\n", "from tpot.export_utils import set_param_recursive\n", "\n", "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n", @@ -126,9 +129,9 @@ "training_features, testing_features, training_target, testing_target = \\\n", " train_test_split(features, tpot_data['target'], random_state=12)\n", "\n", - "# Average CV score on the training set was: -39782.73498449405\n", + "# Average CV score on the training set was: -40245.87130877891\n", "exported_pipeline = make_pipeline(\n", - " StackingEstimator(estimator=ElasticNet(l1_ratio=0.45, tol=0.1)),\n", + " RobustScaler(),\n", " Ridge()\n", ")\n", "# Fix random state for all the steps in exported pipeline\n", From 4b6f2a1acb52209ef2e43c4c044dcdaf78af650e Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Wed, 26 Aug 2020 17:15:04 -0400 Subject: [PATCH 24/39] update using docs --- docs_sources/using.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/docs_sources/using.md b/docs_sources/using.md index a4e1bcf2..f55c9939 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -459,9 +459,9 @@ Currently only classification is supported, but future releases will include reg TPOT-cuML -TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. +TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated.
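As a minimal sketch of selecting this configuration (the dataset and search settings here are illustrative; the key pieces are `config_dict="TPOT cuML"` and leaving `n_jobs` at its default of 1):

```Python
from tpot import TPOTClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Illustrative CPU data; the cuML-backed estimators handle the GPU side internally.
X, y = make_classification(n_samples=10_000, n_features=20, random_state=12)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    config_dict="TPOT cuML",  # selects the cuML + XGBoost search space
    n_jobs=1,                 # cuML requires n_jobs=1 (the default)
    cv=2,
    verbosity=2,
    random_state=12,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
```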

-This configuration is useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor. +This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor. Classification

Regression From c2865642ef50e6c446a66b31ddd42861ad386a26 Mon Sep 17 00:00:00 2001 From: jorijnsmit Date: Sat, 29 Aug 2020 11:11:20 +0200 Subject: [PATCH 25/39] ignore RuntimeWarning thrown by np.nanmean when all scores are np.nan --- tpot/gp_deap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index ae1afab7..f50574a8 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -447,7 +447,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target, else: try: with warnings.catch_warnings(): - warnings.simplefilter('ignore') + warnings.simplefilter('ignore', category=RuntimeWarning) scores = [_fit_and_score(estimator=clone(sklearn_pipeline), X=features, y=target, From e481cacfdc0f3077fb3025421ffa2738b44957cf Mon Sep 17 00:00:00 2001 From: jorijnsmit Date: Sat, 29 Aug 2020 11:29:40 +0200 Subject: [PATCH 26/39] np.nanmean warning was not suppressed due to indentation --- tpot/gp_deap.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index f50574a8..a1d4f7e8 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -447,7 +447,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target, else: try: with warnings.catch_warnings(): - warnings.simplefilter('ignore', category=RuntimeWarning) + warnings.simplefilter('ignore') scores = [_fit_and_score(estimator=clone(sklearn_pipeline), X=features, y=target, @@ -459,8 +459,9 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target, error_score='raise', fit_params=sample_weight_dict) for train, test in cv_iter] - CV_score = np.array(scores)[:, 0] - return np.nanmean(CV_score) + CV_score = np.array(scores)[:, 0] + CV_score_mean = np.nanmean(CV_score) + return CV_score_mean except TimeoutException: return "Timeout" except Exception as e: From ca2231975e3d3725e35abe2038c0ea48c5fef96c Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Mon, 31 Aug 2020 09:28:55 -0700 Subject: [PATCH 27/39] cleanup Higgs notebook --- tutorials/Higgs_Boson.ipynb | 70 ++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/tutorials/Higgs_Boson.ipynb b/tutorials/Higgs_Boson.ipynb index e0e4122a..003541e6 100644 --- a/tutorials/Higgs_Boson.ipynb +++ b/tutorials/Higgs_Boson.ipynb @@ -6,7 +6,7 @@ "source": [ "This notebook walks through a basic example of using the GPU-accelerated estimators from [RAPIDS](https://rapids.ai/) cuML and [DMLC/XGBoost](https://github.com/dmlc/xgboost) with TPOT for classification tasks. You must have access to an NVIDIA GPU and have cuML installed in your environment. Running this notebook without cuML will cause TPOT to raise a `ValueError`, indicating you should install cuML.\n", "\n", - "It is intended to show how the `TPOT cuML` configuration can provide significant benefits on medium-sized and larger datasets. " + "It is intended to show how the `TPOT cuML` configuration can provide significant performance benefits on medium-sized and larger datasets. " ] }, { @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -36,9 +36,27 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-08-31 09:16:09-- https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz\n", + "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 
128.195.10.252\n", + "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2816407858 (2.6G) [application/x-httpd-php]\n", + "Saving to: ‘./HIGGS.csv.gz’\n", + "\n", + "HIGGS.csv.gz 100%[===================>] 2.62G 102MB/s in 29s \n", + "\n", + "2020-08-31 09:16:38 (93.5 MB/s) - ‘./HIGGS.csv.gz’ saved [2816407858/2816407858]\n", + "\n" + ] + } + ], "source": [ "# This is a 2.7 GB file.\n", "# Please make sure you have enough space available before\n", @@ -48,12 +66,12 @@ "DATASET_PATH = os.path.join(DATA_DIRECTORY, \"HIGGS.csv.gz\")\n", "\n", "# if not os.path.isfile(DATASET_PATH):\n", - "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz" + "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz -P {DATA_DIRECTORY}" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -79,16 +97,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the interest of time, we'll only use a 500,000 row sample of this file. 500,000 rows is more than enough for this example.\n", - "\n", - "With the example configuration below (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup while achieving essentially equivalent accuracy.\n", - "\n", - "Such speedups also mean you can create larger evolutionary search strategies while **still** obtaining faster results." + "In the interest of time, we'll only use a 500,000 row sample of this file. 500,000 rows is more than enough for this example." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -105,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -138,19 +152,19 @@ "Generation 9 - Current best internal CV score: 0.7308699999999999\n", "Generation 10 - Current best internal CV score: 0.7347775\n", "Best pipeline: XGBClassifier(input_matrix, alpha=1, learning_rate=0.1, max_depth=8, min_child_weight=19, n_estimators=100, nthread=1, subsample=0.8, tree_method=gpu_hist)\n", - "CPU times: user 5min 34s, sys: 1min 16s, total: 6min 50s\n", - "Wall time: 6min 52s\n" + "CPU times: user 5min 19s, sys: 54.7 s, total: 6min 14s\n", + "Wall time: 6min 17s\n" ] }, { "data": { "text/plain": [ "TPOTClassifier(config_dict='TPOT cuML', cv=2, generations=10,\n", - " log_file=,\n", + " log_file=,\n", " population_size=10, random_state=12, verbosity=2)" ] }, - "execution_count": 14, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -179,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -187,8 +201,8 @@ "output_type": "stream", "text": [ "0.73669\n", - "CPU times: user 816 ms, sys: 39.9 ms, total: 856 ms\n", - "Wall time: 855 ms\n" + "CPU times: user 770 ms, sys: 31.7 ms, total: 802 ms\n", + "Wall time: 801 ms\n" ] } ], @@ -288,6 +302,22 @@ "preds = tpot.predict(X_test)\n", "print(accuracy_score(y_test, preds))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance Comparison\n", + "With the example configuration above (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup while achieving essentially equivalent accuracy.\n", 
+ "\n", + "The GPU-accelerated version achieves an out-of-sample accuracy of 73.7% in **seven minutes**, while the default version achieves an accuracy of 73.8% after more than **five hours**. This kind of speedup also means you can create larger evolutionary search strategies while **still** obtaining faster results.\n", + "\n", + "### Hardware\n", + "The following hardware was used for this test. Results and speedups will vary.\n", + "\n", + "- CPU: 2x Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz (24 cores)\n", + "- GPU: 1x NVIDIA V100 32GB" + ] } ], "metadata": { From a45046df22c92ab35e0bee71ff3a67326995fba7 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 3 Sep 2020 14:45:48 -0700 Subject: [PATCH 28/39] update regressor/clasifier configs --- tpot/config/classifier_cuml.py | 9 --------- tpot/config/regressor_cuml.py | 9 --------- 2 files changed, 18 deletions(-) diff --git a/tpot/config/classifier_cuml.py b/tpot/config/classifier_cuml.py index 010a058f..13b1c26a 100644 --- a/tpot/config/classifier_cuml.py +++ b/tpot/config/classifier_cuml.py @@ -37,15 +37,6 @@ "weights": ["uniform",], }, - "cuml.ensemble.RandomForestClassifier": { - "n_estimators": [100, 300], - "split_algo": [0, 1], - "max_depth": range(8, 16), - "max_features": np.arange(0.05, 1.01, 0.05), - "min_rows_per_node": range(2, 21), - "n_bins": [64,] - }, - "cuml.linear_model.LogisticRegression": { "penalty": ["l1", "l2", "elasticnet"], "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.,], diff --git a/tpot/config/regressor_cuml.py b/tpot/config/regressor_cuml.py index 3d61720c..87749611 100644 --- a/tpot/config/regressor_cuml.py +++ b/tpot/config/regressor_cuml.py @@ -46,15 +46,6 @@ "normalize": [True, False] }, - "cuml.ensemble.RandomForestRegressor": { - "n_estimators": [100, 300, 500,], - "split_algo": [0, 1,], - "max_depth": range(8, 20), - "max_features": np.arange(0.05, 1.01, 0.05), - "min_rows_per_node": range(2, 21), - "n_bins": [64,] - }, - "cuml.linear_model.Ridge": { }, From 3a749076d9885823f005339ac24e9cc177340216 Mon Sep 17 00:00:00 2001 From: Nick Becker Date: Thu, 3 Sep 2020 15:03:48 -0700 Subject: [PATCH 29/39] update example higgs notebook for 0.16 --- tutorials/Higgs_Boson.ipynb | 70 ++++++++++++++----------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/tutorials/Higgs_Boson.ipynb b/tutorials/Higgs_Boson.ipynb index 003541e6..9dbd6bed 100644 --- a/tutorials/Higgs_Boson.ipynb +++ b/tutorials/Higgs_Boson.ipynb @@ -38,25 +38,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2020-08-31 09:16:09-- https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz\n", - "Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252\n", - "Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 2816407858 (2.6G) [application/x-httpd-php]\n", - "Saving to: ‘./HIGGS.csv.gz’\n", - "\n", - "HIGGS.csv.gz 100%[===================>] 2.62G 102MB/s in 29s \n", - "\n", - "2020-08-31 09:16:38 (93.5 MB/s) - ‘./HIGGS.csv.gz’ saved [2816407858/2816407858]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "# This is a 2.7 GB file.\n", "# Please make sure you have enough space available before\n", @@ -71,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -102,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -141,30 +123,30 @@ "output_type": "stream", "text": [ "\n", - "Generation 1 - Current best internal CV score: 0.7103025000000001\n", - "Generation 2 - Current best internal CV score: 0.7103025000000001\n", - "Generation 3 - Current best internal CV score: 0.725755\n", - "Generation 4 - Current best internal CV score: 0.727995\n", - "Generation 5 - Current best internal CV score: 0.727995\n", - "Generation 6 - Current best internal CV score: 0.730315\n", - "Generation 7 - Current best internal CV score: 0.730315\n", - "Generation 8 - Current best internal CV score: 0.730315\n", - "Generation 9 - Current best internal CV score: 0.7308699999999999\n", - "Generation 10 - Current best internal CV score: 0.7347775\n", - "Best pipeline: XGBClassifier(input_matrix, alpha=1, learning_rate=0.1, max_depth=8, min_child_weight=19, n_estimators=100, nthread=1, subsample=0.8, tree_method=gpu_hist)\n", - "CPU times: user 5min 19s, sys: 54.7 s, total: 6min 14s\n", - "Wall time: 6min 17s\n" + "Generation 1 - Current best internal CV score: 0.730335\n", + "Generation 2 - Current best internal CV score: 0.730335\n", + "Generation 3 - Current best internal CV score: 0.730335\n", + "Generation 4 - Current best internal CV score: 0.735615\n", + "Generation 5 - Current best internal CV score: 0.7359375\n", + "Generation 6 - Current best internal CV score: 0.7359375\n", + "Generation 7 - Current best internal CV score: 0.7359375\n", + "Generation 8 - Current best internal CV score: 0.7359375\n", + "Generation 9 - Current best internal CV score: 0.736115\n", + "Generation 10 - Current best internal CV score: 0.7361850000000001\n", + "Best pipeline: XGBClassifier(ZeroCount(SelectPercentile(ZeroCount(input_matrix), percentile=99)), alpha=1, learning_rate=0.1, max_depth=9, min_child_weight=11, n_estimators=100, nthread=1, subsample=0.7000000000000001, tree_method=gpu_hist)\n", + "CPU times: user 8min 15s, sys: 1min 17s, total: 9min 33s\n", + "Wall time: 9min 39s\n" ] }, { "data": { "text/plain": [ "TPOTClassifier(config_dict='TPOT cuML', cv=2, generations=10,\n", - " log_file=,\n", + " log_file=,\n", " population_size=10, random_state=12, verbosity=2)" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -193,16 +175,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.73669\n", - "CPU times: user 770 ms, sys: 31.7 ms, total: 802 ms\n", - "Wall time: 801 ms\n" + "0.73853\n", + "CPU times: user 950 ms, sys: 36.2 ms, total: 986 ms\n", + "Wall time: 984 ms\n" ] } ], @@ -308,12 +290,12 @@ "metadata": {}, "source": [ "## Performance Comparison\n", - 
"With the example configuration above (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provides a significant speedup while achieving essentially equivalent accuracy.\n", + "With the example configuration above (10 generations, population size of 10, two-fold cross validation), the `TPOT cuML` configuration provided a significant speedup while achieving essentially equivalent accuracy.\n", "\n", - "The GPU-accelerated version achieves an out-of-sample accuracy of 73.7% in **seven minutes**, while the default version achieves an accuracy of 73.8% after more than **five hours**. This kind of speedup also means you can create larger evolutionary search strategies while **still** obtaining faster results.\n", + "The GPU-accelerated version achieved an out-of-sample accuracy of 73.85% in **fewer than 10 minutes**, while the default version achieved an accuracy of 73.79% after more than **five hours** (specific performance values will vary across runs). This kind of speedup also means you can create larger evolutionary search strategies while **still** obtaining faster results.\n", "\n", "### Hardware\n", - "The following hardware was used for this test. Results and speedups will vary.\n", + "The following hardware was used for this test. Results and speedups will vary across systems and configurations.\n", "\n", "- CPU: 2x Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz (24 cores)\n", "- GPU: 1x NVIDIA V100 32GB" From 46475bcfe48988a9e2f5da89bab570d51af761ff Mon Sep 17 00:00:00 2001 From: Andreas Fehlner Date: Sat, 3 Oct 2020 22:48:48 +0200 Subject: [PATCH 30/39] #DC spelling --- README.md | 2 +- docs_sources/api.md | 2 +- docs_sources/releases.md | 2 +- docs_sources/using.md | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c741552a..15d50f07 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Click on the corresponding links to find more information on TPOT usage in the d ### Classification -Below is a minimal working example with the the optical recognition of handwritten digits dataset. +Below is a minimal working example with the optical recognition of handwritten digits dataset. ```python from tpot import TPOTClassifier diff --git a/docs_sources/api.md b/docs_sources/api.md index ea1fe7bf..e2a97bd2 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -227,7 +227,7 @@ Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. -log_file: io.TextIOWrapper or io.StringIO, optional (defaul: sys.stdout) +log_file: io.TextIOWrapper or io.StringIO, optional (default: sys.stdout)

Save progress content to a file. diff --git a/docs_sources/releases.md b/docs_sources/releases.md index 13e892bd..724b90af 100644 --- a/docs_sources/releases.md +++ b/docs_sources/releases.md @@ -75,7 +75,7 @@ - We refined parameters in VarianceThreshold and FeatureAgglomeration. -- TPOT now supports using memory caching within a Pipeline via a optional `memory` parameter. +- TPOT now supports using memory caching within a Pipeline via an optional `memory` parameter. - We improved documentation of TPOT. diff --git a/docs_sources/using.md b/docs_sources/using.md index e2687a90..f4252791 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -566,7 +566,7 @@ If a specific operator, e.g. `SelectPercentile`, is preferred for usage in the 1 ## FeatureSetSelector in TPOT -`FeatureSetSelector` is a special new operator in TPOT. This operator enables feature selection based on *priori* expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) in the 1st step of pipeline via `template` option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by ";". Below is a example how to use this operator in TPOT. +`FeatureSetSelector` is a special new operator in TPOT. This operator enables feature selection based on *priori* expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) in the 1st step of pipeline via `template` option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by ";". Below is an example how to use this operator in TPOT. Please check our [preprint paper](https://www.biorxiv.org/content/10.1101/502484v1.article-info) for more details. @@ -655,7 +655,7 @@ To use your Dask cluster to fit a TPOT model, specify the ``use_dask`` keyword w estimator = TPOTEstimator(use_dask=True, n_jobs=-1) ``` -This will use use all the workers on your cluster to do the training, and use [Dask-ML's pipeline rewriting](https://dask-ml.readthedocs.io/en/latest/hyper-parameter-search.html#avoid-repeated-work) to avoid re-fitting estimators multiple times on the same set of data. +This will use all the workers on your cluster to do the training, and use [Dask-ML's pipeline rewriting](https://dask-ml.readthedocs.io/en/latest/hyper-parameter-search.html#avoid-repeated-work) to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the [distributed scheduler UI](https://distributed.readthedocs.io/en/latest/web.html). 
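As a concrete sketch of this pattern (assuming `dask.distributed` and `dask-ml` are installed; the local cluster size and TPOT settings below are illustrative):

```Python
from dask.distributed import Client
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# A local cluster for illustration; connecting to a remote scheduler works the same way.
client = Client(n_workers=4, threads_per_worker=1)

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# use_dask=True sends pipeline evaluation to the Dask cluster;
# n_jobs=-1 lets TPOT use all available workers.
tpot = TPOTClassifier(generations=5, population_size=20,
                      use_dask=True, n_jobs=-1, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
```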
Alternatively, Dask implements a joblib backend. From 5865228a0c4d2e095d519245b1c10a491db63fdb Mon Sep 17 00:00:00 2001 From: Andreas Fehlner Date: Sat, 3 Oct 2020 23:08:32 +0200 Subject: [PATCH 31/39] #DC spelling --- tpot/base.py | 6 +++--- tpot/config/regressor.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 69361799..748253c9 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1619,7 +1619,7 @@ def _operator_count(self, individual): ---------- individual: list A grown tree with leaves at possibly different depths - dependending on the condition function. + depending on the condition function. Returns ------- @@ -1670,7 +1670,7 @@ def _generate(self, pset, min_, max_, condition, type_=None): min_: int Minimum height of the produced trees. max_: int - Maximum Height of the produced trees. + Maximum height of the produced trees. condition: function The condition is a function that takes two arguments, the height of the tree to build and the current @@ -1683,7 +1683,7 @@ def _generate(self, pset, min_, max_, condition, type_=None): ------- individual: list A grown tree with leaves at possibly different depths - dependending on the condition function. + depending on the condition function. """ if type_ is None: type_ = pset.ret diff --git a/tpot/config/regressor.py b/tpot/config/regressor.py index 33ec7478..56849056 100644 --- a/tpot/config/regressor.py +++ b/tpot/config/regressor.py @@ -116,7 +116,7 @@ 'power_t': [0.5, 0.0, 1.0, 0.1, 100.0, 10.0, 50.0] }, - # Preprocesssors + # Preprocessors 'sklearn.preprocessing.Binarizer': { 'threshold': np.arange(0.0, 1.01, 0.05) }, From d955f2a1ba7c3512f737163281635916f9ba9c44 Mon Sep 17 00:00:00 2001 From: Weixuan Date: Mon, 5 Oct 2020 13:11:34 -0400 Subject: [PATCH 32/39] ret check error --- tpot/gp_deap.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 1dc74067..c30894b2 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -341,7 +341,7 @@ def mutNodeReplacement(individual, pset): rindex = None if index + 1 < len(individual): for i, tmpnode in enumerate(individual[index + 1:], index + 1): - if isinstance(tmpnode, gp.Primitive) and tmpnode.ret in tmpnode.args: + if isinstance(tmpnode, gp.Primitive) and tmpnode.ret in node.args: rindex = i break @@ -349,6 +349,7 @@ def mutNodeReplacement(individual, pset): # for example: if op.root is True then the node.ret is Output_DF object # based on the function _setup_pset. Then primitives is the list of classifor or regressor primitives = pset.primitives[node.ret] + if len(primitives) != 0: new_node = np.random.choice(primitives) new_subtree = [None] * len(new_node.args) @@ -371,6 +372,7 @@ def mutNodeReplacement(individual, pset): # combine with primitives new_subtree.insert(0, new_node) individual[slice_] = new_subtree + return individual, From c3dbca5c13c217ce454576c4e41bdb8ac4d62bb8 Mon Sep 17 00:00:00 2001 From: Weixuan Date: Mon, 5 Oct 2020 15:07:59 -0400 Subject: [PATCH 33/39] Add string path support for log/log_file parameter #1114 --- docs_sources/api.md | 5 ++++- tests/tpot_tests.py | 24 +++++++++++++++++++----- tpot/base.py | 22 +++++++++++++--------- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/docs_sources/api.md b/docs_sources/api.md index e2a97bd2..13e72542 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -227,10 +227,13 @@ Flag indicating whether the TPOT version checker should be disabled. 
The update checker will tell you when a new version of TPOT has been released.
-log_file: io.TextIOWrapper or io.StringIO, optional (default: sys.stdout) +log_file: file-like object (io.TextIOWrapper or io.StringIO) or string, optional (default: None)

Save progress content to a file. +If it is a string giving the path and file name of the desired output file, +TPOT will create the file and write the log into it. +If it is None, TPOT will write the log to sys.stdout
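For instance, a minimal sketch of the two accepted forms (the file name and search settings are illustrative):

```Python
from tpot import TPOTClassifier

# Pass a path as a string; TPOT creates the file and writes the log into it.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      log_file="tpot_progress.log")

# Or pass an already-open file-like object (io.TextIOWrapper or io.StringIO).
log_handle = open("tpot_progress.log", "w")
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      log_file=log_handle)
```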
diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 760c11f1..8c00a1f0 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -197,18 +197,32 @@ def test_init_custom_parameters(): assert tpot_obj._exported_pipeline_text == [] assert tpot_obj.log_file == sys.stdout -def test_init_custom_progress_file(): +def test_init_log_file(): """ Assert that TPOT has right file handler to save progress. """ cachedir = mkdtemp() file_name = cachedir + "/progress.log" file_handle = open(file_name, "w") tpot_obj = TPOTClassifier(log_file=file_handle) - assert tpot_obj.log_file == file_handle + tpot_obj._fit_init() + assert tpot_obj.log_file_ == file_handle file_handle.close() # clean up rmtree(cachedir) +def test_init_log_file_2(): + """ Assert that TPOT has right file handler to save progress via string input.""" + cachedir = mkdtemp() + file_name = cachedir + "/progress.log" + tpot_obj = TPOTClassifier(log_file=file_name) + tpot_obj._fit_init() + from io import TextIOWrapper + assert isinstance(tpot_obj.log_file_, TextIOWrapper) + tpot_obj.log_file_.close() + # clean up + rmtree(cachedir) + + def test_init_default_scoring(): """Assert that TPOT intitializes with the correct default scoring function.""" tpot_obj = TPOTRegressor() @@ -706,7 +720,7 @@ def test_sample_weight_func(): assert not np.allclose(cv_score1, cv_score_weight) assert np.allclose(known_score, score, rtol=0.01) - + def test_template_1(): @@ -1131,7 +1145,7 @@ def test_fit_cuml(): verbosity=0, config_dict='TPOT cuML' ) - + tpot_regr_obj = TPOTRegressor( random_state=42, population_size=1, @@ -1145,7 +1159,7 @@ def test_fit_cuml(): tpot_clf_obj.fit(training_features, training_target) assert isinstance(tpot_clf_obj._optimized_pipeline, creator.Individual) assert not (tpot_clf_obj._start_datetime is None) - + tpot_regr_obj.fit(pretest_X_reg, pretest_y_reg) assert isinstance(tpot_regr_obj._optimized_pipeline, creator.Individual) assert not (tpot_regr_obj._start_datetime is None) diff --git a/tpot/base.py b/tpot/base.py index 9a6f6b0a..b93a1306 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -242,7 +242,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, A setting of 2 or higher will add a progress bar during the optimization procedure. disable_update_check: bool, optional (default: False) Flag indicating whether the TPOT version checker should be disabled. - log_file: io.TextIOWrapper or io.StringIO, optional (defaul: sys.stdout) + log_file: string, io.TextIOWrapper or io.StringIO, optional (defaul: sys.stdout) Save progress content to a file. 
Returns @@ -589,7 +589,11 @@ def _fit_init(self): self._pbar = None if not self.log_file: - self.log_file = sys.stdout + self.log_file_ = sys.stdout + elif isinstance(self.log_file, str): + self.log_file_ = open(self.log_file, 'w') + else: + self.log_file_ = self.log_file self._setup_scoring_function(self.scoring) @@ -716,7 +720,7 @@ def pareto_eq(ind1, ind2): else: total_evals = self._lambda * self.generations + self.population_size - self._pbar = tqdm(total=total_evals, unit='pipeline', leave=False, file=self.log_file, + self._pbar = tqdm(total=total_evals, unit='pipeline', leave=False, file=self.log_file_, disable=not (self.verbosity >= 2), desc='Optimization Progress') try: @@ -735,15 +739,15 @@ def pareto_eq(ind1, ind2): halloffame=self._pareto_front, verbose=self.verbosity, per_generation_function=self._check_periodic_pipeline, - log_file=self.log_file + log_file=self.log_file_ ) # Allow for certain exceptions to signal a premature fit() cancellation except (KeyboardInterrupt, SystemExit, StopIteration) as e: if self.verbosity > 0: - self._pbar.write('', file=self.log_file) + self._pbar.write('', file=self.log_file_) self._pbar.write('{}\nTPOT closed prematurely. Will use the current best pipeline.'.format(e), - file=self.log_file) + file=self.log_file_) finally: # clean population for the next call if warm_start=False if not self.warm_start: @@ -1358,10 +1362,10 @@ def _evaluate_individuals(self, population, features, target, sample_weight=None except (KeyboardInterrupt, SystemExit, StopIteration) as e: if self.verbosity > 0: - self._pbar.write('', file=self.log_file) + self._pbar.write('', file=self.log_file_) self._pbar.write('{}\nTPOT closed during evaluation in one generation.\n' 'WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.'.format(e), - file=self.log_file) + file=self.log_file_) # number of individuals already evaluated in this generation num_eval_ind = len(result_score_list) @@ -1510,7 +1514,7 @@ def _update_pbar(self, pbar_num=1, pbar_msg=None): """ if not isinstance(self._pbar, type(None)): if self.verbosity > 2 and pbar_msg is not None: - self._pbar.write(pbar_msg, file=self.log_file) + self._pbar.write(pbar_msg, file=self.log_file_) if not self._pbar.disable: self._pbar.update(pbar_num) From f75dc40c09d7a74240421ed9b20620bb8b87167c Mon Sep 17 00:00:00 2001 From: Weixuan Date: Mon, 5 Oct 2020 15:17:12 -0400 Subject: [PATCH 34/39] fix unit tests --- tests/tpot_tests.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 8c00a1f0..a3dc1567 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -195,7 +195,7 @@ def test_init_custom_parameters(): assert tpot_obj._optimized_pipeline_score == None assert tpot_obj.fitted_pipeline_ == None assert tpot_obj._exported_pipeline_text == [] - assert tpot_obj.log_file == sys.stdout + assert tpot_obj.log_file_ == sys.stdout def test_init_log_file(): """ Assert that TPOT has right file handler to save progress. 
""" @@ -1300,7 +1300,7 @@ def test_check_periodic_pipeline(): ) tpot_obj.fit(training_features, training_target) with closing(StringIO()) as our_file: - tpot_obj.log_file = our_file + tpot_obj.log_file_ = our_file tpot_obj.verbosity = 3 tpot_obj._last_pipeline_write = datetime.now() sleep(0.11) @@ -1344,7 +1344,7 @@ def test_save_periodic_pipeline(): ) tpot_obj.fit(training_features, training_target) with closing(StringIO()) as our_file: - tpot_obj.log_file = our_file + tpot_obj.log_file_ = our_file tpot_obj.verbosity = 3 tpot_obj._last_pipeline_write = datetime.now() sleep(0.11) @@ -1374,7 +1374,7 @@ def test_save_periodic_pipeline_2(): ) tpot_obj.fit(training_features, training_target) with closing(StringIO()) as our_file: - tpot_obj.log_file = our_file + tpot_obj.log_file_ = our_file tpot_obj.verbosity = 3 tpot_obj._last_pipeline_write = datetime.now() sleep(0.11) @@ -1405,7 +1405,7 @@ def test_check_periodic_pipeline_3(): ) tpot_obj.fit(training_features, training_target) with closing(StringIO()) as our_file: - tpot_obj.log_file = our_file + tpot_obj.log_file_ = our_file tpot_obj.verbosity = 3 tpot_obj._exported_pipeline_text = [] tpot_obj._last_pipeline_write = datetime.now() @@ -1648,7 +1648,7 @@ def test_update_pbar(): # reset verbosity = 3 for checking pbar message tpot_obj.verbosity = 3 with closing(StringIO()) as our_file: - tpot_obj.log_file=our_file + tpot_obj.log_file_ = our_file tpot_obj._pbar = tqdm(total=10, disable=False, file=our_file) tpot_obj._update_pbar(pbar_num=2, pbar_msg="Test Warning Message") our_file.seek(0) @@ -1667,7 +1667,7 @@ def test_update_val(): # reset verbosity = 3 for checking pbar message tpot_obj.verbosity = 3 with closing(StringIO()) as our_file: - tpot_obj.log_file=our_file + tpot_obj.log_file_ = our_file tpot_obj._pbar = tqdm(total=10, disable=False, file=our_file) result_score_list = [] result_score_list = tpot_obj._update_val(0.9999, result_score_list) @@ -1714,7 +1714,7 @@ def test_preprocess_individuals(): # reset verbosity = 3 for checking pbar message tpot_obj.verbosity = 3 with closing(StringIO()) as our_file: - tpot_obj.log_file=our_file + tpot_obj.log_file_ = our_file tpot_obj._pbar = tqdm(total=2, disable=False, file=our_file) operator_counts, eval_individuals_str, sklearn_pipeline_list, _ = \ tpot_obj._preprocess_individuals(individuals) @@ -1760,7 +1760,7 @@ def test_preprocess_individuals_2(): # reset verbosity = 3 for checking pbar message tpot_obj.verbosity = 3 with closing(StringIO()) as our_file: - tpot_obj.log_file=our_file + tpot_obj.log_file_ = our_file tpot_obj._pbar = tqdm(total=3, disable=False, file=our_file) operator_counts, eval_individuals_str, sklearn_pipeline_list, _ = \ tpot_obj._preprocess_individuals(individuals) @@ -1807,7 +1807,7 @@ def test_preprocess_individuals_3(): # reset verbosity = 3 for checking pbar message with closing(StringIO()) as our_file: - tpot_obj.log_file=our_file + tpot_obj.log_file_ = our_file tpot_obj._lambda=4 tpot_obj._pbar = tqdm(total=2, disable=False, file=our_file) tpot_obj._pbar.n = 2 From 32565d08da16bb308c9a4f0e8a572847bf9a8897 Mon Sep 17 00:00:00 2001 From: Weixuan Date: Thu, 15 Oct 2020 09:21:08 -0400 Subject: [PATCH 35/39] fix issue #1128 --- tpot/tpot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tpot/tpot.py b/tpot/tpot.py index 065c790d..9841939f 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -56,7 +56,6 @@ def _init_pretest(self, features, target): random_state=self.random_state, test_size=None, train_size=min(50,int(0.9*features.shape[0])), - stratify=target ) #Make 
sure there is a least one example from each class #for this evaluative test sample From 4c17e371b51ee075767aae1ae101fb823d706a84 Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Wed, 21 Oct 2020 10:45:47 -0400 Subject: [PATCH 36/39] Update installing.md --- docs_sources/installing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs_sources/installing.md b/docs_sources/installing.md index 1c4557cb..a76b24a6 100644 --- a/docs_sources/installing.md +++ b/docs_sources/installing.md @@ -47,7 +47,7 @@ pip install xgboost If you have issues installing XGBoost, check the [XGBoost installation documentation](http://xgboost.readthedocs.io/en/latest/build.html). -If you plan to use [Dask](http://dask.pydata.org/en/latest/) for parallel training, make sure to install [dask[delay] and dask[dataframe]](https://docs.dask.org/en/latest/install.html) and [dask_ml](https://dask-ml.readthedocs.io/en/latest/install.html). +If you plan to use [Dask](http://dask.pydata.org/en/latest/) for parallel training, make sure to install [dask[delay] and dask[dataframe]](https://docs.dask.org/en/latest/install.html) and [dask_ml](https://dask-ml.readthedocs.io/en/latest/install.html). **It is noted that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0.** ```Shell pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 From 11794919d2db5100117a1fb2417ff943baca52ee Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Mon, 26 Oct 2020 10:08:58 -0400 Subject: [PATCH 37/39] update docs --- docs_sources/installing.md | 14 +++++++++++++- docs_sources/releases.md | 24 ++++++++++++++++++++++++ docs_sources/using.md | 4 ++-- tpot-cuml.yml | 17 +++++++++++++++++ tpot/_version.py | 2 +- tpot/base.py | 14 ++++++++++++++ 6 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 tpot-cuml.yml diff --git a/docs_sources/installing.md b/docs_sources/installing.md index a76b24a6..5557f853 100644 --- a/docs_sources/installing.md +++ b/docs_sources/installing.md @@ -50,7 +50,7 @@ If you have issues installing XGBoost, check the [XGBoost installation documenta If you plan to use [Dask](http://dask.pydata.org/en/latest/) for parallel training, make sure to install [dask[delay] and dask[dataframe]](https://docs.dask.org/en/latest/install.html) and [dask_ml](https://dask-ml.readthedocs.io/en/latest/install.html). **It is noted that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0.** ```Shell -pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 +pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 distributed>=2.10.0 ``` If you plan to use the [TPOT-MDR configuration](https://arxiv.org/abs/1702.01780), make sure to install [scikit-mdr](https://github.com/EpistasisLab/scikit-mdr) and [scikit-rebate](https://github.com/EpistasisLab/scikit-rebate): @@ -85,6 +85,18 @@ conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate As mentioned above, we recommend following [PyTorch's installation instructions](https://pytorch.org/get-started/locally/) for installing it to enable support for [PyTorch](https://pytorch.org/)-based neural networks (TPOT-NN). +## Installation for using TPOT-cuML configuration + +With "TPOT cuML" configuration (see built-in configurations), TPOT will search over a restricted configuration using the GPU-accelerated estimators in [RAPIDS cuML](https://github.com/rapidsai/cuml) and [DMLC XGBoost](https://github.com/dmlc/xgboost). 
**This configuration requires an NVIDIA Pascal architecture or better GPU with [compute capability 6.0+](https://developer.nvidia.com/cuda-gpus), and that the library cuML is installed.** With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the `TPOTClassifier` and `TPOTRegressor`. + +Please download this conda environment yml file to install TPOT for using TPOT-cuML configuration. + +``` +conda env create -f tpot-cuml.yml -n tpot-cuml +conda activate tpot-cuml +``` + + ## Installation problems Please [file a new issue](https://github.com/EpistasisLab/tpot/issues/new) if you run into installation problems. diff --git a/docs_sources/releases.md b/docs_sources/releases.md index 724b90af..8e8f227b 100644 --- a/docs_sources/releases.md +++ b/docs_sources/releases.md @@ -1,5 +1,29 @@ # Release Notes +## Version 0.11.6 + +- Fix a bug causing point mutation function does not work properly with using `template` option +- Add a new built configuration called "TPOT cuML" which TPOT will search over a restricted configuration using the GPU-accelerated estimators in [RAPIDS cuML](https://github.com/rapidsai/cuml) and [DMLC XGBoost](https://github.com/dmlc/xgboost). **This configuration requires an NVIDIA Pascal architecture or better GPU with [compute capability 6.0+](https://developer.nvidia.com/cuda-gpus), and that the library cuML is installed.** +- Add string path support for log/log_file parameter +- Fix a bug in version 0.11.5 causing no update in stdout after each generation +- Fix minor bugs + + +## Version 0.11.5 + +- Make `Pytorch` as an optional dependency +- Refine installation documentation + +## Version 0.11.4 + +- Add a new built configuration "TPOT NN" which includes all operators in "Default TPOT" plus additional neural network estimators written in PyTorch (currently `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier` for classification tasks only) +- Refine `log_file` parameter's behavior + +## Version 0.11.3 + +- Fix a bug in TPOTRegressor in v0.11.2 +- Add `-log` option in command line interface to save process log to a file. + ## Version 0.11.2 - Fix `early_stop` parameter does not work properly diff --git a/docs_sources/using.md b/docs_sources/using.md index c0dde523..e3649159 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -450,7 +450,7 @@ This configuration works for both the TPOTClassifier and TPOTRegressor. -TPOT-NN +TPOT NN TPOT uses the same configuration as "Default TPOT" plus additional neural network estimators written in PyTorch (currently only `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier`).

Currently only classification is supported, but future releases will include regression estimators. @@ -458,7 +458,7 @@ Currently only classification is supported, but future releases will include reg -TPOT-cuML +TPOT cuML TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated.

This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor. diff --git a/tpot-cuml.yml b/tpot-cuml.yml new file mode 100644 index 00000000..d60a3aa1 --- /dev/null +++ b/tpot-cuml.yml @@ -0,0 +1,17 @@ +channels: + - rapidsai + - nvidia + - conda-forge + - defaults +dependencies: + - python=3.7 + - cudatoolkit=10.2 + - cuml=0.16 + - scikit-learn + - ipython + - ipywidgets + - jupyterlab + - pip + - pip: + - xgboost + - tpot diff --git a/tpot/_version.py b/tpot/_version.py index 37374bff..929ad0b7 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -23,4 +23,4 @@ """ -__version__ = '0.11.5' +__version__ = '0.11.6' diff --git a/tpot/base.py b/tpot/base.py index b93a1306..289021f5 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -821,6 +821,20 @@ def _update_top_pipeline(self): self._optimized_pipeline_score = pipeline_scores.wvalues[1] if not self._optimized_pipeline: + # pick one individual from evaluated pipeline for a error message + eval_ind_list = list(self.evaluated_individuals_.keys()) + for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)): + if np.isinf(pipeline_scores.wvalues[1]): + sklearn_pipeline = self._toolbox.compile(expr=pipeline) + from sklearn.model_selection import cross_val_score + cv_scores = cross_val_score(sklearn_pipeline, + self.pretest_X, + self.pretest_y, + cv=self.cv, + scoring=self.scoring_function, + verbose=0, + error_score="raise") + break raise RuntimeError('There was an error in the TPOT optimization ' 'process. This could be because the data was ' 'not formatted properly, or because data for ' From b441aaf487a8168c2c9316dd7fb968cb5600e752 Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Mon, 26 Oct 2020 10:20:05 -0400 Subject: [PATCH 38/39] change error table --- docs_sources/examples.md | 2 ++ tests/nn_tests.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs_sources/examples.md b/docs_sources/examples.md index 51a2bacc..559315e4 100644 --- a/docs_sources/examples.md +++ b/docs_sources/examples.md @@ -11,6 +11,8 @@ belonging to a typical class of machine learning tasks. 
| Titanic | survival analysis | classification | [link](https://www.kaggle.com/c/titanic/data) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/Titanic_Kaggle.ipynb) | | Bank Marketing | subscription prediction | classification | [link](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/Portuguese%20Bank%20Marketing/Portuguese%20Bank%20Marketing%20Strategy.ipynb) | | MAGIC Gamma Telescope | event detection | classification | [link](https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/MAGIC%20Gamma%20Telescope/MAGIC%20Gamma%20Telescope.ipynb) | +| cuML Classification Example | random classification problem | classification | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/cuML_Classification_Example.ipynb) | +| cuML Regression Example | random regression problem | regression | [link](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html) | [link](https://github.com/EpistasisLab/tpot/blob/master/tutorials/cuML_Regression_Example.ipynb) | **Notes:** - For details on how the `fit()`, `score()` and `export()` methods work, refer to the [usage documentation](/using/). diff --git a/tests/nn_tests.py b/tests/nn_tests.py index 2a460704..b9d657cd 100644 --- a/tests/nn_tests.py +++ b/tests/nn_tests.py @@ -68,7 +68,7 @@ def test_nn_errors_on_multiclass(): config_dict=classifier_config_nn, template='PytorchLRClassifier' ) - assert_raises(RuntimeError, clf.fit, multiclass_X, multiclass_y) + assert_raises(ValueError, clf.fit, multiclass_X, multiclass_y) def test_pytorch_lr_classifier(): """Assert that the PytorchLRClassifier model works. (NN)""" From c766c1b894195f6286562ec8da22637688b6d733 Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Mon, 26 Oct 2020 10:33:19 -0400 Subject: [PATCH 39/39] mkdocs and update example --- docs/api/index.html | 40 ++++------ docs/citing/index.html | 9 +-- docs/examples/index.html | 35 +++++---- docs/index.html | 2 +- docs/installing/index.html | 34 ++++----- docs/releases/index.html | 33 +++++++- docs/search/search_index.json | 2 +- docs/sitemap.xml | 20 ++--- docs/sitemap.xml.gz | Bin 269 -> 269 bytes docs/using/index.html | 80 +++++++++----------- tutorials/cuML_Classification_Example.ipynb | 4 +- tutorials/cuML_Regression_Example.ipynb | 4 +- 12 files changed, 138 insertions(+), 125 deletions(-) diff --git a/docs/api/index.html b/docs/api/index.html index f478bc5e..ebf93c0a 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -147,7 +147,6 @@

Classification

disable_update_check=False, log_file=None ) -

Automated machine learning for supervised classification tasks.

@@ -352,10 +351,13 @@

Classification

The update checker will tell you when a new version of TPOT has been released. -log_file: io.TextIOWrapper or io.StringIO, optional (defaul: sys.stdout) +log_file: file-like class (io.TextIOWrapper or io.StringIO) or string, optional (default: None)

Save progress content to a file. +If it is a string giving the path and file name of the desired output file, +TPOT will create the file and write the log into it. +If it is None, TPOT will write the log to sys.stdout
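For context, a minimal usage sketch of the new option (illustrative only, not part of the patch); it assumes the patched behavior described above, where log_file may be an open file-like object, a path string, or None:

```python
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, train_size=0.75, test_size=0.25)

# Passing a plain path string: TPOT opens the file itself and writes the
# per-generation progress messages there instead of to sys.stdout.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      log_file="tpot_progress.log")
tpot.fit(X_train, y_train)
```

An already-open io.TextIOWrapper or io.StringIO object still works as before, and log_file=None keeps the log on sys.stdout.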
@@ -389,7 +391,7 @@

Classification

Example

-
from tpot import TPOTClassifier
+
from tpot import TPOTClassifier
 from sklearn.datasets import load_digits
 from sklearn.model_selection import train_test_split
 
@@ -402,7 +404,6 @@ 

Classification

print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')
-

Functions

@@ -432,9 +433,8 @@

Classification

-
fit(features, classes, sample_weight=None, groups=None)
+
fit(features, classes, sample_weight=None, groups=None)
 
-
Run the TPOT optimization process on the given training data.

@@ -486,9 +486,8 @@

Classification

-
predict(features)
+
predict(features)
 
-
Use the optimized pipeline to predict the classes for a feature set.

@@ -515,9 +514,8 @@

Classification

-
predict_proba(features)
+
predict_proba(features)
 
-
Use the optimized pipeline to estimate the class probabilities for a feature set.

@@ -546,9 +544,8 @@

Classification

-
score(testing_features, testing_classes)
+
score(testing_features, testing_classes)
 
-
Returns the optimized pipeline's score on the given testing data using the user-specified scoring function.

@@ -582,9 +579,8 @@

Classification

-
export(output_file_name, data_file_path)
+
export(output_file_name, data_file_path)
 
-
Export the optimized pipeline as Python code.
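As a small follow-on sketch (illustrative, reusing the fitted tpot object from the Example block above), the optional data_file_path argument only changes the dataset path written into the exported script:

```python
# Write the best pipeline TPOT found to a standalone Python script.
tpot.export('tpot_digits_pipeline.py')

# Optionally substitute a concrete dataset path for the default
# 'PATH/TO/DATA/FILE' placeholder in the exported script
# (the CSV path below is purely illustrative).
tpot.export('tpot_digits_pipeline.py', data_file_path='./digits.csv')
```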

@@ -631,7 +627,6 @@

Regression

early_stop=None, verbosity=0, disable_update_check=False)
-

Automated machine learning for supervised regression tasks.

@@ -868,7 +863,7 @@

Regression

Example

-
from tpot import TPOTRegressor
+
from tpot import TPOTRegressor
 from sklearn.datasets import load_boston
 from sklearn.model_selection import train_test_split
 
@@ -881,7 +876,6 @@ 

Regression

print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py')
-

Functions

@@ -906,9 +900,8 @@

Regression

-
fit(features, target, sample_weight=None, groups=None)
+
fit(features, target, sample_weight=None, groups=None)
 
-
Run the TPOT optimization process on the given training data.

@@ -960,9 +953,8 @@

Regression

-
predict(features)
+
predict(features)
 
-
Use the optimized pipeline to predict the target values for a feature set.

@@ -989,9 +981,8 @@

Regression

-
score(testing_features, testing_target)
+
score(testing_features, testing_target)
 
-
Returns the optimized pipeline's score on the given testing data using the user-specified scoring function.

@@ -1025,9 +1016,8 @@

Regression

-
export(output_file_name)
+
export(output_file_name)
 
-
Export the optimized pipeline as Python code.

diff --git a/docs/citing/index.html b/docs/citing/index.html index bc2c7360..c95b8779 100644 --- a/docs/citing/index.html +++ b/docs/citing/index.html @@ -128,7 +128,7 @@

Citing TPOT

If you use TPOT in a scientific publication, please consider citing at least one of the following papers:

Trang T. Le, Weixuan Fu and Jason H. Moore (2020). Scaling tree-based automated machine learning to biomedical big data with a feature set selector. Bioinformatics.36(1): 250-256.

BibTeX entry:

-
@article{le2020scaling,
+
@article{le2020scaling,
   title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector},
   author={Le, Trang T and Fu, Weixuan and Moore, Jason H},
   journal={Bioinformatics},
@@ -139,10 +139,9 @@ 

Citing TPOT

publisher={Oxford University Press} }
-

Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization. Applications of Evolutionary Computation, pages 123-137.

BibTeX entry:

-
@inbook{Olson2016EvoBio,
+
@inbook{Olson2016EvoBio,
     author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.},
     editor={Squillero, Giovanni and Burelli, Paolo},
     chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization},
@@ -155,11 +154,10 @@ 

Citing TPOT

url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} }
-

Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science

Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science. Proceedings of GECCO 2016, pages 485-492.

BibTeX entry:

-
@inproceedings{OlsonGECCO2016,
+
@inproceedings{OlsonGECCO2016,
     author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.},
     title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science},
     booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016},
@@ -176,7 +174,6 @@ 

Citing TPOT

address = {New York, NY, USA}, }
-

Alternatively, you can cite the repository directly with the following DOI:

DOI

diff --git a/docs/examples/index.html b/docs/examples/index.html index d542d455..d32f62a1 100644 --- a/docs/examples/index.html +++ b/docs/examples/index.html @@ -194,6 +194,20 @@

Overview

link link + +cuML Classification Example +random classification problem +classification +link +link + + +cuML Regression Example +random regression problem +regression +link +link +

Notes: @@ -201,7 +215,7 @@

Overview

- Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.

Iris flower classification

The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset.

-
from tpot import TPOTClassifier
+
from tpot import TPOTClassifier
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 import numpy as np
@@ -215,9 +229,8 @@ 

Iris flower classification

print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py')
-

Running this code should discover a pipeline (exported as tpot_iris_pipeline.py) that achieves about 97% test accuracy:

-
import numpy as np
+
import numpy as np
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KNeighborsClassifier
@@ -242,10 +255,9 @@ 

Iris flower classification

exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
-

Digits dataset

Below is a minimal working example with the optical recognition of handwritten digits dataset, which is an image classification problem.

-
from tpot import TPOTClassifier
+
from tpot import TPOTClassifier
 from sklearn.datasets import load_digits
 from sklearn.model_selection import train_test_split
 
@@ -258,9 +270,8 @@ 

Digits dataset

print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')
-

Running this code should discover a pipeline (exported as tpot_digits_pipeline.py) that achieves about 98% test accuracy:

-
import numpy as np
+
import numpy as np
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
@@ -288,10 +299,9 @@ 

Digits dataset

exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
-

Boston housing prices modeling

The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset.

-
from tpot import TPOTRegressor
+
from tpot import TPOTRegressor
 from sklearn.datasets import load_boston
 from sklearn.model_selection import train_test_split
 
@@ -304,9 +314,8 @@ 

Boston housing prices modeling

print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py')
-

Running this code should discover a pipeline (exported as tpot_boston_pipeline.py) that achieves at least 10 mean squared error (MSE) on the test set:

-
import numpy as np
+
import numpy as np
 import pandas as pd
 from sklearn.ensemble import ExtraTreesRegressor
 from sklearn.model_selection import train_test_split
@@ -331,7 +340,6 @@ 

Boston housing prices modeling

exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
-

Titanic survival analysis

To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here. This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.

Portuguese Bank Marketing

@@ -340,7 +348,7 @@

MAGIC Gamma Telescope

The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here.

Neural network classifier using TPOT-NN

By loading the TPOT-NN configuration dictionary, PyTorch estimators will be included for classification. Users can also create their own NN configuration dictionary that includes tpot.builtins.PytorchLRClassifier and/or tpot.builtins.PytorchMLPClassifier, or they can specify them using a template string, as shown in the following example:

-
from tpot import TPOTClassifier
+
from tpot import TPOTClassifier
 from sklearn.datasets import make_blobs
 from sklearn.model_selection import train_test_split
 
@@ -353,7 +361,6 @@ 

Neural network classifier using print(clf.score(X_test, y_test)) clf.export('tpot_nn_demo_pipeline.py')

-

This example is somewhat trivial, but it should result in nearly 100% classification accuracy.

diff --git a/docs/index.html b/docs/index.html index bbf901aa..faa9d2a0 100644 --- a/docs/index.html +++ b/docs/index.html @@ -204,5 +204,5 @@ diff --git a/docs/installing/index.html b/docs/installing/index.html index 86b24cb1..02550cc6 100644 --- a/docs/installing/index.html +++ b/docs/installing/index.html @@ -58,6 +58,8 @@
  • conda-forge
  • +
  • Installation for using TPOT-cuML configuration +
  • Installation problems
  • @@ -165,42 +167,40 @@

    Installation

    You can install TPOT using pip or conda-forge.

    pip

    NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command:

    -
    conda install numpy scipy scikit-learn pandas joblib pytorch
    +
    conda install numpy scipy scikit-learn pandas joblib pytorch
     
    -

    DEAP, update_checker, tqdm and stopit can be installed with pip via the command:

    -
    pip install deap update_checker tqdm stopit
    +
    pip install deap update_checker tqdm stopit
     
    -

    Optionally, you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors.

    -
    pip install xgboost
    +
    pip install xgboost
     
    -

    If you have issues installing XGBoost, check the XGBoost installation documentation.

    -

    If you plan to use Dask for parallel training, make sure to install dask[delay] and dask[dataframe] and dask_ml.

    -
    pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3
    +

If you plan to use Dask for parallel training, make sure to install dask[delayed] and dask[dataframe] and dask_ml. Note that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0.

    +
    pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 distributed>=2.10.0
     
    -
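Once these packages are installed, Dask-backed training is enabled through TPOT's use_dask and n_jobs parameters. A minimal sketch, assuming a local dask.distributed cluster (the cluster size and TPOT settings are illustrative):

```python
from dask.distributed import Client
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

client = Client(n_workers=4)  # start a local Dask cluster

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

# use_dask=True lets Dask-ML avoid refitting identical estimators and gives
# better diagnostics on the Dask dashboard; n_jobs=-1 uses all available workers.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      n_jobs=-1, use_dask=True)
tpot.fit(X_train, y_train)
```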

    If you plan to use the TPOT-MDR configuration, make sure to install scikit-mdr and scikit-rebate:

    -
    pip install scikit-mdr skrebate
    +
    pip install scikit-mdr skrebate
     
    -

    To enable support for PyTorch-based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly.

    We recommend following PyTorch's installation instructions customized for your operating system and Python distribution.

    Finally to install TPOT itself, run the following command:

    -
    pip install tpot
    +
    pip install tpot
     
    -

    conda-forge

    To install tpot and its core dependencies you can use:

    -
    conda install -c conda-forge tpot
    +
    conda install -c conda-forge tpot
     
    -

    To install additional dependencies you can use:

    -
    conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate
    +
    conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate
     
    -

    As mentioned above, we recommend following PyTorch's installation instructions for installing it to enable support for PyTorch-based neural networks (TPOT-NN).

    +

    Installation for using TPOT-cuML configuration

    +

    With "TPOT cuML" configuration (see built-in configurations), TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor.

    +

Please download this conda environment yml file and use it to install TPOT for the TPOT cuML configuration.

    +
    conda env create -f tpot-cuml.yml -n tpot-cuml
    +conda activate tpot-cuml
    +
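After activating the environment, the GPU-restricted search space is selected through the config_dict parameter. A minimal sketch (illustrative; it assumes the built-in configuration is selected with the string "TPOT cuML", matching the name used in the built-in configurations table):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

X, y = make_classification(n_samples=10000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# "TPOT cuML" restricts the search to GPU-accelerated cuML and XGBoost estimators.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      config_dict="TPOT cuML")
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
```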

    Installation problems

    Please file a new issue if you run into installation problems.

    diff --git a/docs/releases/index.html b/docs/releases/index.html index 783754bf..cba285e7 100644 --- a/docs/releases/index.html +++ b/docs/releases/index.html @@ -74,6 +74,14 @@
    • Release Notes
        +
      • Version 0.11.6 +
      • +
      • Version 0.11.5 +
      • +
      • Version 0.11.4 +
      • +
      • Version 0.11.3 +
      • Version 0.11.2
      • Version 0.11.1 @@ -159,6 +167,29 @@

        Release Notes

        +

        Version 0.11.6

        +
          +
• Fix a bug that prevented the point mutation function from working properly when using the template option
        • +
        • Add a new built configuration called "TPOT cuML" which TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed.
        • +
        • Add string path support for log/log_file parameter
        • +
        • Fix a bug in version 0.11.5 causing no update in stdout after each generation
        • +
        • Fix minor bugs
        • +
        +

        Version 0.11.5

        +
          +
• Make Pytorch an optional dependency
        • +
        • Refine installation documentation
        • +
        +

        Version 0.11.4

        +
          +
        • Add a new built configuration "TPOT NN" which includes all operators in "Default TPOT" plus additional neural network estimators written in PyTorch (currently tpot.builtins.PytorchLRClassifier and tpot.builtins.PytorchMLPClassifier for classification tasks only)
        • +
        • Refine log_file parameter's behavior
        • +
        +

        Version 0.11.3

        +
          +
        • Fix a bug in TPOTRegressor in v0.11.2
        • +
        • Add -log option in command line interface to save process log to a file.
        • +

        Version 0.11.2

        • Fix early_stop parameter does not work properly
        • @@ -240,7 +271,7 @@

          Version 0.9.5

          We refined parameters in VarianceThreshold and FeatureAgglomeration.

        • -

          TPOT now supports using memory caching within a Pipeline via a optional memory parameter.

          +

          TPOT now supports using memory caching within a Pipeline via an optional memory parameter.

        • We improved documentation of TPOT.

          diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 96a967c0..63254711 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Consider TPOT your Data Science Assistant . TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. An example machine learning pipeline Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. An example TPOT pipeline TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. TPOT is still under active development and we encourage you to check back on this repository regularly for updates.","title":"Home"},{"location":"api/","text":"TPOT API Classification class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False, log_file =None ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int or None optional (default=100) Number of iterations to the run pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. 
mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'precision' etc. (suffixes apply as with \u2018f1\u2019), 'recall' etc. (suffixes apply as with \u2018f1\u2019), \u2018jaccard\u2019 etc. (suffixes apply as with \u2018f1\u2019), 'roc_auc', \u2018roc_auc_ovr\u2019, \u2018roc_auc_ovo\u2019, \u2018roc_auc_ovr_weighted\u2019, \u2018roc_auc_ovo_weighted\u2019 If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. 
Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. 
Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. log_file : io.TextIOWrapper or io.StringIO, optional (defaul: sys.stdout) Save progress content to a file. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. 
At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name, data_file_path) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file data_file_path : string By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. Returns: exported_code_string : string The whole pipeline text as a string should be returned if output_file_name is not specified. Regression class tpot. 
TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int or None, optional (default=100) Number of iterations to the run pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . See the section on scoring functions for more details. 
cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split digits = load_boston() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . 
Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file data_file_path : string By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. Returns: exported_code_string : string The whole pipeline text as a string should be returned if output_file_name is not specified.","title":"TPOT API"},{"location":"api/#tpot-api","text":"","title":"TPOT API"},{"location":"api/#classification","text":"class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False, log_file =None ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int or None optional (default=100) Number of iterations to the run pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. 
Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'precision' etc. (suffixes apply as with \u2018f1\u2019), 'recall' etc. (suffixes apply as with \u2018f1\u2019), \u2018jaccard\u2019 etc. (suffixes apply as with \u2018f1\u2019), 'roc_auc', \u2018roc_auc_ovr\u2019, \u2018roc_auc_ovo\u2019, \u2018roc_auc_ovr_weighted\u2019, \u2018roc_auc_ovo_weighted\u2019 If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. 
Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. 
It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save the pipelines on the Pareto front so far while optimizing. Currently once per generation, but not more often than once per 30 seconds. Useful in multiple cases: sudden death before TPOT could save an optimized pipeline, tracking its progress, or grabbing pipelines while it is still optimizing. early_stop : integer, optional (default: None) How many generations TPOT checks for improvement in the optimization process. TPOT ends the optimization process if there is no improvement within the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. log_file : io.TextIOWrapper or io.StringIO, optional (default: sys.stdout) Save progress content to a file. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code.
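For illustration, a minimal sketch of the log_file option described above; the log file name is a placeholder and the small generation/population settings are only meant to keep the run short:

    from tpot import TPOTClassifier
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split

    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                        train_size=0.75, test_size=0.25)

    # Redirect TPOT's progress messages from sys.stdout to an open file handle.
    with open('tpot_progress.log', 'w') as log:
        tpot = TPOTClassifier(generations=2, population_size=10,
                              verbosity=2, log_file=log)
        tpot.fit(X_train, y_train)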
fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name, data_file_path) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file data_file_path : string By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. 
Returns: exported_code_string : string The whole pipeline text as a string should be returned if output_file_name is not specified.","title":"Classification"},{"location":"api/#regression","text":"class tpot. TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int or None, optional (default=100) Number of iterations to the run pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. 
If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
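As a brief, hedged illustration of the template string format described above, the sketch below uses the 'SelectPercentile-Transformer-Regressor' example from this section; TPOT still searches over the operators and hyperparameters for each step, only the overall linear structure is fixed:

    from tpot import TPOTRegressor

    # Constrain TPOT to a fixed linear structure: a SelectPercentile feature selector,
    # followed by any transformer, followed by any regressor.
    tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2,
                         template='SelectPercentile-Transformer-Regressor')
    # tpot.fit(X_train, y_train) would then only evaluate pipelines matching this structure.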
pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split digits = load_boston() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . 
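A hedged sketch of passing per-sample weights to fit(), following the load_boston example used elsewhere in this documentation; the weighting scheme (up-weighting the last third of the samples) is purely illustrative:

    import numpy as np
    from sklearn.datasets import load_boston
    from tpot import TPOTRegressor

    housing = load_boston()
    X, y = housing.data, housing.target

    # Illustrative weights: give the last third of the samples twice the importance.
    sample_weight = np.ones(len(y))
    sample_weight[-len(y) // 3:] = 2.0

    tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
    tpot.fit(X, y, sample_weight=sample_weight)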
Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file data_file_path : string By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. Returns: exported_code_string : string The whole pipeline text as a string should be returned if output_file_name is not specified.","title":"Regression"},{"location":"citing/","text":"Citing TPOT If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Trang T. Le, Weixuan Fu and Jason H. Moore (2020). Scaling tree-based automated machine learning to biomedical big data with a feature set selector . Bioinformatics .36(1): 250-256. BibTeX entry: @article{le2020scaling, title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector}, author={Le, Trang T and Fu, Weixuan and Moore, Jason H}, journal={Bioinformatics}, volume={36}, number={1}, pages={250--256}, year={2020}, publisher={Oxford University Press} } Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. 
and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI: DOI","title":"Citing TPOT"},{"location":"citing/#citing-tpot","text":"If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Trang T. Le, Weixuan Fu and Jason H. Moore (2020). Scaling tree-based automated machine learning to biomedical big data with a feature set selector . Bioinformatics .36(1): 250-256. BibTeX entry: @article{le2020scaling, title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector}, author={Le, Trang T and Fu, Weixuan and Moore, Jason H}, journal={Bioinformatics}, volume={36}, number={1}, pages={250--256}, year={2020}, publisher={Oxford University Press} } Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI: DOI","title":"Citing TPOT"},{"location":"contributing/","text":"Contribution Guide We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it. 
Project layout The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch. How to contribute The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.) Before submitting your pull request Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. 
Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh After submitting your pull request After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"Contributing"},{"location":"contributing/#contribution-guide","text":"We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it.","title":"Contribution Guide"},{"location":"contributing/#project-layout","text":"The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch.","title":"Project layout"},{"location":"contributing/#how-to-contribute","text":"The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. 
Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.)","title":"How to contribute"},{"location":"contributing/#before-submitting-your-pull-request","text":"Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh","title":"Before submitting your pull request"},{"location":"contributing/#after-submitting-your-pull-request","text":"After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"After submitting your pull request"},{"location":"examples/","text":"Overview The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link Optical Recognition of Handwritten Digits digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here. Iris flower classification The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9826086956521738 exported_pipeline = make_pipeline( Normalizer(norm=\"l2\"), KNeighborsClassifier(n_neighbors=5, p=2, weights=\"distance\") ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Digits dataset Below is a minimal working example with the optical recognition of handwritten digits dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import PolynomialFeatures from tpot.builtins import StackingEstimator from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9799428471757372 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), StackingEstimator(estimator=LogisticRegression(C=0.1, dual=False, penalty=\"l1\")), RandomForestClassifier(bootstrap=True, criterion=\"entropy\", max_features=0.35000000000000003, min_samples_leaf=20, min_samples_split=19, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Boston housing prices modeling The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. 
from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np import pandas as pd from sklearn.ensemble import ExtraTreesRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: -10.812040755234403 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Titanic survival analysis To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT. Portuguese Bank Marketing The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . MAGIC Gamma Telescope The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . Neural network classifier using TPOT-NN By loading the TPOT-NN configuration dictionary , PyTorch estimators will be included for classification. Users can also create their own NN configuration dictionary that includes tpot.builtins.PytorchLRClassifier and/or tpot.builtins.PytorchMLPClassifier , or they can specify them using a template string, as shown in the following example: from tpot import TPOTClassifier from sklearn.datasets import make_blobs from sklearn.model_selection import train_test_split X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25) clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier', verbosity=2, population_size=10, generations=10) clf.fit(X_train, y_train) print(clf.score(X_test, y_test)) clf.export('tpot_nn_demo_pipeline.py') This example is somewhat trivial, but it should result in nearly 100% classification accuracy.","title":"Examples"},{"location":"examples/#overview","text":"The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link Optical Recognition of Handwritten Digits digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.","title":"Overview"},{"location":"examples/#iris-flower-classification","text":"The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9826086956521738 exported_pipeline = make_pipeline( Normalizer(norm=\"l2\"), KNeighborsClassifier(n_neighbors=5, p=2, weights=\"distance\") ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Iris flower classification"},{"location":"examples/#digits-dataset","text":"Below is a minimal working example with the optical recognition of handwritten digits dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import PolynomialFeatures from tpot.builtins import StackingEstimator from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9799428471757372 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), StackingEstimator(estimator=LogisticRegression(C=0.1, dual=False, penalty=\"l1\")), RandomForestClassifier(bootstrap=True, criterion=\"entropy\", max_features=0.35000000000000003, min_samples_leaf=20, min_samples_split=19, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Digits dataset"},{"location":"examples/#boston-housing-prices-modeling","text":"The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. 
from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np import pandas as pd from sklearn.ensemble import ExtraTreesRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: -10.812040755234403 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Boston housing prices modeling"},{"location":"examples/#titanic-survival-analysis","text":"To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.","title":"Titanic survival analysis"},{"location":"examples/#portuguese-bank-marketing","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Portuguese Bank Marketing"},{"location":"examples/#magic-gamma-telescope","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"MAGIC Gamma Telescope"},{"location":"examples/#neural-network-classifier-using-tpot-nn","text":"By loading the TPOT-NN configuration dictionary , PyTorch estimators will be included for classification. 
Users can also create their own NN configuration dictionary that includes tpot.builtins.PytorchLRClassifier and/or tpot.builtins.PytorchMLPClassifier , or they can specify them using a template string, as shown in the following example: from tpot import TPOTClassifier from sklearn.datasets import make_blobs from sklearn.model_selection import train_test_split X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25) clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier', verbosity=2, population_size=10, generations=10) clf.fit(X_train, y_train) print(clf.score(X_test, y_test)) clf.export('tpot_nn_demo_pipeline.py') This example is somewhat trivial, but it should result in nearly 100% classification accuracy.","title":"Neural network classifier using TPOT-NN"},{"location":"installing/","text":"Installation TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. Support for Python 3.4 and below has been officially dropped since version 0.11.0. You can install TPOT using pip or conda-forge . pip NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib pytorch DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delay] and dask[dataframe] and dask_ml . pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate To enable support for PyTorch -based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly. We recommend following PyTorch's installation instructions customized for your operating system and Python distribution. Finally to install TPOT itself, run the following command: pip install tpot conda-forge To install tpot and its core dependencies you can use: conda install -c conda-forge tpot To install additional dependencies you can use: conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate As mentioned above, we recommend following PyTorch's installation instructions for installing it to enable support for PyTorch -based neural networks (TPOT-NN). 
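After installation, a quick import check can confirm that TPOT and whichever optional extras you chose are available before you launch a long optimization run. The following snippet is only a minimal sketch; the optional package names it probes are assumptions about which extras you installed, and TPOT works without any of them.

```python
# Post-install sanity check: confirm TPOT imports and report optional extras.
import tpot

print("TPOT version:", tpot.__version__)

# Optional dependencies; a missing one only disables the corresponding operators.
for extra in ("xgboost", "torch", "dask"):
    try:
        __import__(extra)
        print(extra, "is available")
    except ImportError:
        print(extra, "is not installed; TPOT will simply run without it")
```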
Installation problems Please file a new issue if you run into installation problems.","title":"Installation"},{"location":"installing/#installation","text":"TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. Support for Python 3.4 and below has been officially dropped since version 0.11.0. You can install TPOT using pip or conda-forge .","title":"Installation"},{"location":"installing/#pip","text":"NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib pytorch DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delay] and dask[dataframe] and dask_ml . pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate To enable support for PyTorch -based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly. We recommend following PyTorch's installation instructions customized for your operating system and Python distribution. Finally to install TPOT itself, run the following command: pip install tpot","title":"pip"},{"location":"installing/#conda-forge","text":"To install tpot and its core dependencies you can use: conda install -c conda-forge tpot To install additional dependencies you can use: conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate As mentioned above, we recommend following PyTorch's installation instructions for installing it to enable support for PyTorch -based neural networks (TPOT-NN).","title":"conda-forge"},{"location":"installing/#installation-problems","text":"Please file a new issue if you run into installation problems.","title":"Installation problems"},{"location":"related/","text":"Other Automated Machine Learning (AutoML) tools and related projects: Name Language License Description Auto-WEKA Java GPL-v3 Automated model selection and hyper-parameter tuning for Weka models. auto-sklearn Python BSD-3-Clause An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. auto_ml Python MIT Automated machine learning for analytics & production. Supports manual feature type declarations. H2O AutoML Java with Python, Scala & R APIs and web GUI Apache 2.0 Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. devol Python MIT Automated deep neural network design via genetic programming. 
MLBox Python BSD-3-Clause Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. Recipe C GPL-v3 Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. Xcessiv Python Apache 2.0 A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python. GAMA Python Apache 2.0 Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.","title":"Related"},{"location":"releases/","text":"Release Notes Version 0.11.2 Fix early_stop parameter does not work properly TPOT built-in OneHotEncoder can refit to different datasets Fix the issue that the attribute evaluated_individuals_ cannot record correct generation info. Add a new parameter log_file to output logs to a file instead of sys.stdout Fix some code quality issues and mistakes in documentations Fix minor bugs Version 0.11.1 Fix compatibility issue with scikit-learn v0.22 warm_start now saves both Primitive Sets and evaluated_pipelines_ from previous runs; Fix the error that TPOT assign wrong fitness scores to non-evaluated pipelines (interrupted by max_min_mins or KeyboardInterrupt ) ; Fix the bug that mutation operator cannot generate new pipeline when template is not default value and warm_start is True; Fix the bug that max_time_mins cannot stop optimization process when search space is limited. Fix a bug in exported codes when the exported pipeline is only 1 estimator Fix spelling mistakes in documentations Fix some code quality issues Version 0.11.0 Support for Python 3.4 and below has been officially dropped. Also support for scikit-learn 0.20 or below has been dropped. The support of a metric function with the signature score_func(y_true, y_pred) for scoring parameter has been dropped. Refine StackingEstimator for not stacking NaN/Infinity predication probabilities. Fix a bug that population doesn't persist by warm_start=True when max_time_mins is not default value. Now the random_state parameter in TPOT is used for pipeline evaluation instead of using a fixed random seed of 42 before. The set_param_recursive function has been moved to export_utils.py and it can be used in exported codes for setting random_state recursively in scikit-learn Pipeline. It is used to set random_state in fitted_pipeline_ attribute and exported pipelines. TPOT can independently use generations and max_time_mins to limit the optimization process through using one of the parameters or both. .export() function will return string of exported pipeline if output filename is not specified. Add SGDClassifier and SGDRegressor into TPOT default configs. Documentation has been updated Fix minor bugs. Version 0.10.2 TPOT v0.10.2 is the last version to support Python 2.7 and Python 3.4. Minor updates for fixing compatibility issues with the latest version of scikit-learn (version > 0.21) and xgboost (v0.90) Default value of template parameter is changed to None instead. Fix errors in documentation Version 0.10.1 Add data_file_path option into expert function for replacing 'PATH/TO/DATA/FILE' to customized dataset path in exported scripts. (Related issue #838) Change python version in CI tests to 3.7 Add CI tests for macOS. Version 0.10.0 Add a new template option to specify a desired structure for machine learning pipeline in TPOT. Check TPOT API (it will be updated once it is merge to master branch). 
Add FeatureSetSelector operator into TPOT for feature selection based on priori export knowledge. Please check our preprint paper for more details ( Note: it was named DatasetSelector in 1st version paper but we will rename to FeatureSetSelector in next version of the paper ) Refine n_jobs parameter to accept value below -1. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Now memory parameter can create memory cache directory if it does not exist. Fix minor bugs. Version 0.9.6 Fix a bug causing that max_time_mins parameter doesn't work when use_dask=True in TPOT 0.9.5 Now TPOT saves best pareto values best pareto pipeline s in checkpoint folder TPOT raises ImportError if operators in the TPOT configuration are not available when verbosity>2 Thank @PGijsbers for the suggestions. Now TPOT can save scores of individuals already evaluated in any generation even the evaluation process of that generation is interrupted/stopped. But it is noted that, in this case, TPOT will raise this warning message : WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation. , because the pipelines in early generation, e.g. 1st generation, are evolved/modified very limited times via evolutionary algorithm. Fix bugs in configuration of TPOTRegressor Error fixes in documentation Version 0.9.5 TPOT now supports integration with Dask for parallelization + smart caching . Big thanks to the Dask dev team for making this happen! TPOT now supports for imputation/sparse matrices into predict and predict_proba functions. TPOTClassifier and TPOTRegressor now follows scikit-learn estimator API. We refined scoring parameter in TPOT API for accepting Scorer object . We refined parameters in VarianceThreshold and FeatureAgglomeration. TPOT now supports using memory caching within a Pipeline via a optional memory parameter. We improved documentation of TPOT. Version 0.9 TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. 
Refined the TPOT pipeline mutation operator. Version 0.8 TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code. Version 0.7 TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance. Version 0.6 TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. 
If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. Version 0.5 Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes. Version 0.4 In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions Version 0.3 We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. Version 0.2 TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. Version 0.1 First public release of TPOT. 
Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Release Notes"},{"location":"releases/#release-notes","text":"","title":"Release Notes"},{"location":"releases/#version-0112","text":"Fix early_stop parameter does not work properly TPOT built-in OneHotEncoder can refit to different datasets Fix the issue that the attribute evaluated_individuals_ cannot record correct generation info. Add a new parameter log_file to output logs to a file instead of sys.stdout Fix some code quality issues and mistakes in documentations Fix minor bugs","title":"Version 0.11.2"},{"location":"releases/#version-0111","text":"Fix compatibility issue with scikit-learn v0.22 warm_start now saves both Primitive Sets and evaluated_pipelines_ from previous runs; Fix the error that TPOT assign wrong fitness scores to non-evaluated pipelines (interrupted by max_min_mins or KeyboardInterrupt ) ; Fix the bug that mutation operator cannot generate new pipeline when template is not default value and warm_start is True; Fix the bug that max_time_mins cannot stop optimization process when search space is limited. Fix a bug in exported codes when the exported pipeline is only 1 estimator Fix spelling mistakes in documentations Fix some code quality issues","title":"Version 0.11.1"},{"location":"releases/#version-0110","text":"Support for Python 3.4 and below has been officially dropped. Also support for scikit-learn 0.20 or below has been dropped. The support of a metric function with the signature score_func(y_true, y_pred) for scoring parameter has been dropped. Refine StackingEstimator for not stacking NaN/Infinity predication probabilities. Fix a bug that population doesn't persist by warm_start=True when max_time_mins is not default value. Now the random_state parameter in TPOT is used for pipeline evaluation instead of using a fixed random seed of 42 before. The set_param_recursive function has been moved to export_utils.py and it can be used in exported codes for setting random_state recursively in scikit-learn Pipeline. It is used to set random_state in fitted_pipeline_ attribute and exported pipelines. TPOT can independently use generations and max_time_mins to limit the optimization process through using one of the parameters or both. .export() function will return string of exported pipeline if output filename is not specified. Add SGDClassifier and SGDRegressor into TPOT default configs. Documentation has been updated Fix minor bugs.","title":"Version 0.11.0"},{"location":"releases/#version-0102","text":"TPOT v0.10.2 is the last version to support Python 2.7 and Python 3.4. Minor updates for fixing compatibility issues with the latest version of scikit-learn (version > 0.21) and xgboost (v0.90) Default value of template parameter is changed to None instead. Fix errors in documentation","title":"Version 0.10.2"},{"location":"releases/#version-0101","text":"Add data_file_path option into expert function for replacing 'PATH/TO/DATA/FILE' to customized dataset path in exported scripts. (Related issue #838) Change python version in CI tests to 3.7 Add CI tests for macOS.","title":"Version 0.10.1"},{"location":"releases/#version-0100","text":"Add a new template option to specify a desired structure for machine learning pipeline in TPOT. Check TPOT API (it will be updated once it is merge to master branch). Add FeatureSetSelector operator into TPOT for feature selection based on priori export knowledge. 
Please check our preprint paper for more details ( Note: it was named DatasetSelector in 1st version paper but we will rename to FeatureSetSelector in next version of the paper ) Refine n_jobs parameter to accept value below -1. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Now memory parameter can create memory cache directory if it does not exist. Fix minor bugs.","title":"Version 0.10.0"},{"location":"releases/#version-096","text":"Fix a bug causing that max_time_mins parameter doesn't work when use_dask=True in TPOT 0.9.5 Now TPOT saves best pareto values best pareto pipeline s in checkpoint folder TPOT raises ImportError if operators in the TPOT configuration are not available when verbosity>2 Thank @PGijsbers for the suggestions. Now TPOT can save scores of individuals already evaluated in any generation even the evaluation process of that generation is interrupted/stopped. But it is noted that, in this case, TPOT will raise this warning message : WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation. , because the pipelines in early generation, e.g. 1st generation, are evolved/modified very limited times via evolutionary algorithm. Fix bugs in configuration of TPOTRegressor Error fixes in documentation","title":"Version 0.9.6"},{"location":"releases/#version-095","text":"TPOT now supports integration with Dask for parallelization + smart caching . Big thanks to the Dask dev team for making this happen! TPOT now supports for imputation/sparse matrices into predict and predict_proba functions. TPOTClassifier and TPOTRegressor now follows scikit-learn estimator API. We refined scoring parameter in TPOT API for accepting Scorer object . We refined parameters in VarianceThreshold and FeatureAgglomeration. TPOT now supports using memory caching within a Pipeline via a optional memory parameter. We improved documentation of TPOT.","title":"Version 0.9.5"},{"location":"releases/#version-09","text":"TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. 
Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator.","title":"Version 0.9"},{"location":"releases/#version-08","text":"TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.","title":"Version 0.8"},{"location":"releases/#version-07","text":"TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance.","title":"Version 0.7"},{"location":"releases/#version-06","text":"TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. 
TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.","title":"Version 0.6"},{"location":"releases/#version-05","text":"Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes.","title":"Version 0.5"},{"location":"releases/#version-04","text":"In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions","title":"Version 0.4"},{"location":"releases/#version-03","text":"We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.","title":"Version 0.3"},{"location":"releases/#version-02","text":"TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. 
TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.","title":"Version 0.2"},{"location":"releases/#version-01","text":"First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Version 0.1"},{"location":"support/","text":"TPOT was developed in the Computational Genetics Lab at the University of Pennsylvania with funding from the NIH under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.","title":"Support"},{"location":"using/","text":"Using TPOT What to expect from AutoML software Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT. AutoML algorithms aren't intended to run for only a few minutes Of course, you can run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset. However, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not find any suitable pipeline at all, in which case a RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') will be raised. Often it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search the pipeline space for your dataset. AutoML algorithms can take a long time to finish their search AutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms (random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling, PCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways to ensemble or stack the algorithms within the pipeline. As such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings (100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing. To put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm and how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation, which means that roughly 100,000 models are fit and evaluated on the training data in one grid search. That's a time-consuming procedure, even for simpler models like decision trees. Typical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt the run partway through and see the best results so far. TPOT also provides a warm_start parameter that lets you restart a TPOT run from where it left off. AutoML algorithms can recommend different solutions for the same dataset If you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs may result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means that it uses randomness (in part) to search the possible pipeline space. 
When two TPOT runs recommend different pipelines, this means that the TPOT runs didn't converge due to lack of time or that multiple pipelines perform more-or-less the same on your dataset. This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search. TPOT with code We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets. TPOT on the command line To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. 
-target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer or None Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. How many minutes TPOT has to optimize the pipeline.If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. 
TPOT will stop earlier if generationsis set and all generations are already evaluated. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. 
This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. -log LOG Folder path Save progress content to a file. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit. Scoring functions TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module. Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. 
Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT-NN TPOT uses the same configuration as \"Default TPOT\" plus additional neural network estimators written in PyTorch (currently only `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier`). Currently only classification is supported, but future releases will include regression estimators. Classification To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Customizing TPOT's operators and parameters Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. 
This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. Template option in TPOT The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of how to use the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template but Regressor for a TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'. FeatureSetSelector in TPOT FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge.
For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: the 1st column is feature set names, the 2nd column is the total number of features in one set, and the 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is an example of how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of indexes of subsets in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y) Pipeline caching in TPOT With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you no longer need them. Crash/freeze issue with n_jobs > 1 under OSX or Linux Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. However, it may crash/freeze with n_jobs > 1 under OSX or Linux, as scikit-learn does, especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools.
You can enable the forkserver mode globally for your program by putting the following code into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation . Parallel Training with Dask For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as are available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more. Neural Networks in TPOT ( tpot.nn ) Support for neural network models and deep learning is an experimental feature newly added to TPOT. Available neural network architectures are provided by the tpot.nn module. Unlike regular sklearn estimators, these models need to be written by hand, and must also inherit the appropriate base classes provided by sklearn for all of their built-in modules. In other words, they need to implement methods like .fit() , fit_transform() , get_params() , etc., as described in detail in Developing scikit-learn estimators . Telling TPOT to use built-in PyTorch neural network models Mainly due to the issues described below, TPOT won't use its neural network models unless you explicitly tell it to do so. This is done as follows: Use import tpot.nn before instantiating any TPOT estimators. Use a configuration dictionary that includes one or more tpot.nn estimators, either by writing one manually, including one from a file, or by importing the configuration in tpot/config/classifier_nn.py . A very simple example that will force TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is as follows: tpot_config = { 'tpot.nn.PytorchLRClassifier': { 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.] } } Alternatively, use a template string including PytorchLRClassifier or PytorchMLPClassifier while loading the TPOT-NN configuration dictionary. Neural network models are notorious for being extremely sensitive to their initialization parameters, so you may need to heavily adjust tpot.nn configuration dictionaries in order to attain good performance on your dataset. A simple example of using TPOT-NN is shown in examples .
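As a concrete illustration, a minimal end-to-end sketch might look like the following; it assumes PyTorch is installed, reuses the illustrative learning-rate grid above, and is not taken verbatim from the TPOT examples.
import tpot.nn  # must be imported before creating any TPOT estimators
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    *load_digits(return_X_y=True), train_size=0.75, test_size=0.25)

# Restrict TPOT to the PyTorch-based logistic regression classifier
tpot_config = {
    'tpot.nn.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    }
}

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      config_dict=tpot_config)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))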
Important caveats Neural network models (especially when they reach moderately large sizes) take a notoriously large amount of time and computing power to train. You should expect tpot.nn neural networks to train several orders of magnitude slower than their sklearn alternatives. This can be alleviated somewhat by training the models on computers with CUDA-enabled GPUs. TPOT will occasionally learn pipelines that stack several sklearn estimators. Mathematically, these can be nearly identical to some deep learning models. For example, by stacking several sklearn.linear_model.LogisticRegression s, you end up with a very close approximation of a Multilayer Perceptron; one of the simplest and most well known deep learning architectures. TPOT's genetic programming algorithms generally optimize these 'networks' much faster than PyTorch, which typically uses a more brute-force convex optimization approach. The problem of 'black box' model introspection is one of the most substantial criticisms and challenges of deep learning. This problem persists in tpot.nn , whereas TPOT's default estimators often are far easier to introspect.","title":"Using TPOT"},{"location":"using/#using-tpot","text":"","title":"Using TPOT"},{"location":"using/#what-to-expect-from-automl-software","text":"Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.","title":"What to expect from AutoML software"},{"location":"using/#tpot-with-code","text":"We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets.","title":"TPOT with code"},{"location":"using/#tpot-on-the-command-line","text":"To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer or None Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. 
By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full Python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instances. Setting it to 0.5 means that TPOT randomly collects half of the training samples for the pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes have elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in the TPOT operator configuration.
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. -log LOG Folder path Save progress content to a file. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.","title":"TPOT on the command line"},{"location":"using/#scoring-functions","text":"TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.","title":"Scoring functions"},{"location":"using/#built-in-tpot-configurations","text":"TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. 
Classification Regression TPOT-NN TPOT uses the same configuration as \"Default TPOT\" plus additional neural network estimators written in PyTorch (currently only `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier`). Currently only classification is supported, but future releases will include regression estimators. Classification To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')","title":"Built-in TPOT configurations"},{"location":"using/#customizing-tpots-operators-and-parameters","text":"Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. 
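Such a file might look like the sketch below, which simply reuses the naive Bayes configuration shown above; as noted in the next paragraph, the dictionary must be named tpot_config so that TPOT can find it.
# tpot_classifier_config.py -- custom configuration for the command-line interface
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {},
    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },
    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    }
}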
For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.","title":"Customizing TPOT's operators and parameters"},{"location":"using/#template-option-in-tpot","text":"Template option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. Below is a simple example to use template option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of SelectorMixin ), 2nd step is a feature transformer (a subclass of TransformerMixin ) and 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is subclass of TransformerMixin in scikit-learn, but Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for usage in the 1st step of the pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.","title":"Template option in TPOT"},{"location":"using/#featuresetselector-in-tpot","text":"FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ( MSigDB ) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is a example how to use this operator in TPOT. Please check our preprint paper for more details. 
from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y)","title":"FeatureSetSelector in TPOT"},{"location":"using/#pipeline-caching-in-tpot","text":"With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.","title":"Pipeline caching in TPOT"},{"location":"using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux","text":"Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following codes into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .","title":"Crash/freeze issue with n_jobs > 1 under OSX or Linux"},{"location":"using/#parallel-training-with-dask","text":"For large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. 
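If you just want to experiment locally, a small cluster can be started in a few lines; this is a hedged sketch that assumes the dask and distributed packages are installed, and it is not part of the TPOT API itself.
from dask.distributed import Client, LocalCluster

# Start a small local cluster for experimentation
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)
print(client.dashboard_link)  # link to the diagnostics dashboard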
To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as are available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs if it is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Parallel Training with Dask"},{"location":"using/#neural-networks-in-tpot-tpotnn","text":"Support for neural network models and deep learning is an experimental feature newly added to TPOT. Available neural network architectures are provided by the tpot.nn module. Unlike regular sklearn estimators, these models need to be written by hand, and must also inherit the appropriate base classes provided by sklearn for all of their built-in modules. In other words, they need to implement methods like .fit() , fit_transform() , get_params() , etc., as described in detail in Developing scikit-learn estimators .","title":"Neural Networks in TPOT (tpot.nn)"},{"location":"using/#telling-tpot-to-use-built-in-pytorch-neural-network-models","text":"Mainly due to the issues described below, TPOT won't use its neural network models unless you explicitly tell it to do so. This is done as follows: Use import tpot.nn before instantiating any TPOT estimators. Use a configuration dictionary that includes one or more tpot.nn estimators, either by writing one manually, including one from a file, or by importing the configuration in tpot/config/classifier_nn.py . A very simple example that will force TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is as follows: tpot_config = { 'tpot.nn.PytorchLRClassifier': { 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.] } } Alternatively, use a template string including PytorchLRClassifier or PytorchMLPClassifier while loading the TPOT-NN configuration dictionary. Neural network models are notorious for being extremely sensitive to their initialization parameters, so you may need to heavily adjust tpot.nn configuration dictionaries in order to attain good performance on your dataset. A simple example of using TPOT-NN is shown in examples .","title":"Telling TPOT to use built-in PyTorch neural network models"},{"location":"using/#important-caveats","text":"Neural network models (especially when they reach moderately large sizes) take a notoriously large amount of time and computing power to train. You should expect tpot.nn neural networks to train several orders of magnitude slower than their sklearn alternatives. This can be alleviated somewhat by training the models on computers with CUDA-enabled GPUs. TPOT will occasionally learn pipelines that stack several sklearn estimators.
Mathematically, these can be nearly identical to some deep learning models. For example, by stacking several sklearn.linear_model.LogisticRegression s, you end up with a very close approximation of a Multilayer Perceptron; one of the simplest and most well known deep learning architectures. TPOT's genetic programming algorithms generally optimize these 'networks' much faster than PyTorch, which typically uses a more brute-force convex optimization approach. The problem of 'black box' model introspection is one of the most substantial criticisms and challenges of deep learning. This problem persists in tpot.nn , whereas TPOT's default estimators often are far easier to introspect.","title":"Important caveats"}]} \ No newline at end of file +{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Consider TPOT your Data Science Assistant . TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. An example machine learning pipeline Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. An example TPOT pipeline TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. TPOT is still under active development and we encourage you to check back on this repository regularly for updates.","title":"Home"},{"location":"api/","text":"TPOT API Classification class tpot. TPOTClassifier ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='accuracy', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False, log_file =None ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters. However, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int or None optional (default=100) Number of iterations to the run pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. 
Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') Function used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'precision' etc. (suffixes apply as with \u2018f1\u2019), 'recall' etc. (suffixes apply as with \u2018f1\u2019), \u2018jaccard\u2019 etc. (suffixes apply as with \u2018f1\u2019), 'roc_auc', \u2018roc_auc_ovr\u2019, \u2018roc_auc_ovo\u2019, \u2018roc_auc_ovr_weighted\u2019, \u2018roc_auc_ovo_weighted\u2019 If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. 
use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimizations. This avoids re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save the pipelines in the Pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save an optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in the optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. log_file : file-like class (io.TextIOWrapper or io.StringIO) or string, optional (default: None) Save progress content to a file. If it is a string for the path and file name of the desired output file, TPOT will create the file and write the log into it. If it is None, TPOT will output the log to sys.stdout. Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set.
predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} List of class labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation. This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted classes for the samples in the feature matrix predict_proba(features) Use the optimized pipeline to estimate the class probabilities for a feature set. Note: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples, n_classes} The class probabilities of the input samples score(testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_classes : array-like {n_samples} List of class labels for prediction in the testing set Returns: accuracy_score : float The estimated test set accuracy according to the user-specified scoring function. export(output_file_name, data_file_path) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. 
Parameters: output_file_name : string String containing the path and file name of the desired output file data_file_path : string By default, the path of input dataset is 'PATH/TO/DATA/FILE' by default. If data_file_path is another string, the path will be replaced. Returns: exported_code_string : string The whole pipeline text as a string should be returned if output_file_name is not specified. Regression class tpot. TPOTRegressor ( generations =100, population_size =100, offspring_size =None, mutation_rate =0.9, crossover_rate =0.1, scoring ='neg_mean_squared_error', cv =5, subsample =1.0, n_jobs =1, max_time_mins =None, max_eval_time_mins =5, random_state =None, config_dict =None, template =None, warm_start =False, memory =None, use_dask =False, periodic_checkpoint_folder =None, early_stop =None, verbosity =0, disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models, preprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API . The TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters. However, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int or None, optional (default=100) Number of iterations to the run pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) Number of individuals to retain in the genetic programming population every generation. Must be a positive number. Generally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) Number of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') Function used to evaluate the quality of a given pipeline for the regression problem. 
The following built-in scoring functions can be used: 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' Note that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. If you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . See the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) Cross-validation strategy used when evaluating pipelines. Possible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) Fraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. Setting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process. Setting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. max_eval_time_mins : float, optional (default=5) How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) The seed of the pseudo random number generator used in TPOT. Use this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) A configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. Possible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=None) Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. 
Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. warm_start : boolean, optional (default=False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit() . Setting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a joblib.Memory object or string, optional (default=None) If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation Possible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) Whether to use Dask-ML's pipeline optimiziations. This avoid re-fitting the same estimator on the same split of data multiple times. It will also provide more detailed diagnostics when using Dask's distributed scheduler. See avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) If supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. Currently once per generation but not more often than once per 30 seconds. Useful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) How many generations TPOT checks whether there is no improvement in optimization process. Ends the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) How much information TPOT communicates while it's running. Possible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) Flag indicating whether the TPOT version checker should be disabled. The update checker will tell you when a new version of TPOT has been released. 
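To make these options concrete, the following is a hedged sketch (not taken from the official documentation; the folder name and parameter values are illustrative placeholders) of a TPOTRegressor that combines the template, memory, checkpointing, and early-stopping options described above:

from tpot import TPOTRegressor

# Illustrative configuration only; values and paths are placeholders.
tpot = TPOTRegressor(
    generations=20,
    population_size=50,
    scoring='neg_mean_squared_error',
    template='SelectPercentile-Transformer-Regressor',  # linear pipeline structure
    memory='auto',                                       # cache fitted transformers in a temporary directory
    periodic_checkpoint_folder='tpot_checkpoints',       # periodically save Pareto-front pipelines
    early_stop=5,                                        # stop after 5 generations without improvement
    verbosity=2,
    random_state=42,
)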
Attributes: fitted_pipeline_ : scikit-learn Pipeline object The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. Note: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). This attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) Run the TPOT optimization process on the given training data. Uses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} Feature matrix TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using median value imputation . If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} List of target labels for prediction sample_weight : array-like {n_samples}, optional Per-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect TPOT's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional Group labels for the samples used when performing cross-validation.
This parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object Returns a copy of the fitted TPOT object predict(features) Use the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} Feature matrix Returns: predictions : array-like {n_samples} Predicted target values for the samples in the feature matrix score(testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. The default scoring function for TPOTRegressor is 'neg_mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} Feature matrix of the testing set testing_target : array-like {n_samples} List of target labels for prediction in the testing set Returns: accuracy_score : float The estimated test set score according to the user-specified scoring function. export(output_file_name, data_file_path) Export the optimized pipeline as Python code. See the usage documentation for example usage of the export function. Parameters: output_file_name : string String containing the path and file name of the desired output file data_file_path : string By default, the path of the input dataset is 'PATH/TO/DATA/FILE'. If data_file_path is another string, the path will be replaced. Returns: exported_code_string : string The whole pipeline text is returned as a string if output_file_name is not specified.
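As an illustration of the data_file_path option, a minimal hedged sketch (assuming tpot is an already-fitted TPOT estimator; the file names are placeholders, not part of the official documentation):

# Assumes `tpot` has already been fitted; file names are illustrative placeholders.
tpot.export('tpot_exported_pipeline.py', data_file_path='data/train.csv')
# In the exported script, the default 'PATH/TO/DATA/FILE' placeholder in the
# read_csv call is replaced with 'data/train.csv'.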
Citing TPOT If you use TPOT in a scientific publication, please consider citing at least one of the following papers: Trang T. Le, Weixuan Fu and Jason H. Moore (2020). Scaling tree-based automated machine learning to biomedical big data with a feature set selector . Bioinformatics .36(1): 250-256. BibTeX entry: @article{le2020scaling, title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector}, author={Le, Trang T and Fu, Weixuan and Moore, Jason H}, journal={Bioinformatics}, volume={36}, number={1}, pages={250--256}, year={2020}, publisher={Oxford University Press} } Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). Automating biomedical data science through tree-based pipeline optimization . Applications of Evolutionary Computation , pages 123-137. BibTeX entry: @inbook{Olson2016EvoBio, author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, editor={Squillero, Giovanni and Burelli, Paolo}, chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, year={2016}, publisher={Springer International Publishing}, pages={123--137}, isbn={978-3-319-31204-0}, doi={10.1007/978-3-319-31204-0_9}, url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} } Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science . Proceedings of GECCO 2016 , pages 485-492. BibTeX entry: @inproceedings{OlsonGECCO2016, author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J.
and Moore, Jason H.}, title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, series = {GECCO '16}, year = {2016}, isbn = {978-1-4503-4206-3}, location = {Denver, Colorado, USA}, pages = {485--492}, numpages = {8}, url = {http://doi.acm.org/10.1145/2908812.2908918}, doi = {10.1145/2908812.2908918}, acmid = {2908918}, publisher = {ACM}, address = {New York, NY, USA}, } Alternatively, you can cite the repository directly with the following DOI: DOI Contribution Guide We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it.
Project layout The latest stable release of TPOT is on the master branch , whereas the latest version of TPOT in development is on the development branch . Make sure you are looking at and working on the correct branch if you're looking to contribute code. In terms of directory structure: All of TPOT's code sources are in the tpot directory The documentation sources are in the docs_sources directory Images in the documentation are in the images directory Tutorials for TPOT are in the tutorials directory Unit tests for TPOT are in the tests.py file Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the development branch. How to contribute The preferred way to contribute to TPOT is to fork the main repository on GitHub: Fork the project repository : click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. Clone this copy to your local disk: $ git clone git@github.com:YourUsername/tpot.git $ cd tpot Create a branch to hold your changes: $ git checkout -b my-contribution Make sure your local environment is setup correctly for development. Installation instructions are almost identical to the user instructions except that TPOT should not be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the nose package into your development environment so that you can test changes locally. $ conda install nose Start making changes on your newly created branch, remembering to never work on the master branch! Work on this copy on your computer using Git to do the version control. Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.) Before submitting your pull request Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. 
Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh After submitting your pull request After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.
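Since the checklist above asks contributors to update the project unit tests, and this guide assumes the nose test runner, a minimal hedged sketch of such a test might look like the following (the attribute being checked is purely illustrative, not a required test):

# Hypothetical nose-style test; the specific assertion is illustrative only.
from nose.tools import assert_equal
from tpot import TPOTClassifier

def test_default_verbosity():
    """Assert that TPOTClassifier defaults to verbosity=0, as documented in the API reference."""
    tpot_obj = TPOTClassifier()
    assert_equal(tpot_obj.verbosity, 0)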
Once some changes are saved locally, you can use your tweaked version of TPOT by navigating to the project's base directory and running TPOT directly from the command line: $ python -m tpot.driver or by running script that imports and uses the TPOT module with code similar to from tpot import TPOTClassifier To check your changes haven't broken any existing tests and to check new tests you've added pass run the following (note, you must have the nose package installed within your dev environment for this to work): $ nosetests -s -v When you're done editing and local testing, run: $ git add modified_files $ git commit to record your changes in Git, then push them to GitHub with: $ git push -u origin my-contribution Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the development branch, as the master branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. (If any of the above seems like magic to you, then look up the Git documentation on the web.)","title":"How to contribute"},{"location":"contributing/#before-submitting-your-pull-request","text":"Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. If your contribution changes TPOT in any way: Update the documentation so all of your changes are reflected there. Update the README if anything there has changed. If your contribution involves any code changes: Update the project unit tests to test your code changes. Make sure that your code is properly commented with docstrings and comments explaining your rationale behind non-obvious coding practices. If your code affected any of the pipeline operators, make sure that the corresponding export functionality reflects those changes. If your contribution requires a new library dependency: Double-check that the new dependency is easy to install via pip or Anaconda and supports both Python 2 and 3. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. Add the required version of the library to .travis.yml Add a line to pip install the library to .travis_install.sh Add a line to print the version of the library to .travis_install.sh Similarly add a line to print the version of the library to .travis_test.sh","title":"Before submitting your pull request"},{"location":"contributing/#after-submitting-your-pull-request","text":"After submitting your pull request, Travis-CI will automatically run unit tests on your changes and make sure that your updated code builds and runs on Python 2 and 3. We also use services that automatically check code quality and test coverage. Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.","title":"After submitting your pull request"},{"location":"examples/","text":"Overview The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link Optical Recognition of Handwritten Digits digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link cuML Classification Example random classification problem classification link link cuML Regression Example random regression problem regression link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here. Iris flower classification The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9826086956521738 exported_pipeline = make_pipeline( Normalizer(norm=\"l2\"), KNeighborsClassifier(n_neighbors=5, p=2, weights=\"distance\") ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Digits dataset Below is a minimal working example with the optical recognition of handwritten digits dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import PolynomialFeatures from tpot.builtins import StackingEstimator from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9799428471757372 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), StackingEstimator(estimator=LogisticRegression(C=0.1, dual=False, penalty=\"l1\")), RandomForestClassifier(bootstrap=True, criterion=\"entropy\", max_features=0.35000000000000003, min_samples_leaf=20, min_samples_split=19, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Boston housing prices modeling The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. 
from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves a mean squared error (MSE) of roughly 10 on the test set: import numpy as np import pandas as pd from sklearn.ensemble import ExtraTreesRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: -10.812040755234403 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) Titanic survival analysis To see TPOT applied to the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT. Portuguese Bank Marketing The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . MAGIC Gamma Telescope The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here . Neural network classifier using TPOT-NN By loading the TPOT-NN configuration dictionary , PyTorch estimators will be included for classification. Users can also create their own NN configuration dictionary that includes tpot.builtins.PytorchLRClassifier and/or tpot.builtins.PytorchMLPClassifier , or they can specify them using a template string, as shown in the following example: from tpot import TPOTClassifier from sklearn.datasets import make_blobs from sklearn.model_selection import train_test_split X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25) clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier', verbosity=2, population_size=10, generations=10) clf.fit(X_train, y_train) print(clf.score(X_test, y_test)) clf.export('tpot_nn_demo_pipeline.py') This example is somewhat trivial, but it should result in nearly 100% classification accuracy.","title":"Examples"},{"location":"examples/#overview","text":"The following sections illustrate the usage of TPOT with various datasets, each belonging to a typical class of machine learning tasks. 
Dataset Task Task class Dataset description Jupyter notebook Iris flower classification classification link link Optical Recognition of Handwritten Digits digit recognition (image) classification link link Boston housing prices modeling regression link N/A Titanic survival analysis classification link link Bank Marketing subscription prediction classification link link MAGIC Gamma Telescope event detection classification link link cuML Classification Example random classification problem classification link link cuML Regression Example random regression problem regression link link Notes: - For details on how the fit() , score() and export() methods work, refer to the usage documentation . - Upon re-running the experiments, your resulting pipelines may differ (to some extent) from the ones demonstrated here.","title":"Overview"},{"location":"examples/#iris-flower-classification","text":"The following code illustrates how TPOT can be employed for performing a simple classification task over the Iris dataset. from tpot import TPOTClassifier from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split import numpy as np iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_iris_pipeline.py') Running this code should discover a pipeline (exported as tpot_iris_pipeline.py ) that achieves about 97% test accuracy: import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9826086956521738 exported_pipeline = make_pipeline( Normalizer(norm=\"l2\"), KNeighborsClassifier(n_neighbors=5, p=2, weights=\"distance\") ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Iris flower classification"},{"location":"examples/#digits-dataset","text":"Below is a minimal working example with the optical recognition of handwritten digits dataset, which is an image classification problem . 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Running this code should discover a pipeline (exported as tpot_digits_pipeline.py ) that achieves about 98% test accuracy: import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import PolynomialFeatures from tpot.builtins import StackingEstimator from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: 0.9799428471757372 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), StackingEstimator(estimator=LogisticRegression(C=0.1, dual=False, penalty=\"l1\")), RandomForestClassifier(bootstrap=True, criterion=\"entropy\", max_features=0.35000000000000003, min_samples_leaf=20, min_samples_split=19, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Digits dataset"},{"location":"examples/#boston-housing-prices-modeling","text":"The following code illustrates how TPOT can be employed for performing a regression task over the Boston housing prices dataset. 
from tpot import TPOTRegressor from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split housing = load_boston() X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, train_size=0.75, test_size=0.25, random_state=42) tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_boston_pipeline.py') Running this code should discover a pipeline (exported as tpot_boston_pipeline.py ) that achieves at least 10 mean squared error (MSE) on the test set: import numpy as np import pandas as pd from sklearn.ensemble import ExtraTreesRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from tpot.export_utils import set_param_recursive # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'], random_state=42) # Average CV score on the training set was: -10.812040755234403 exported_pipeline = make_pipeline( PolynomialFeatures(degree=2, include_bias=False, interaction_only=False), ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=100) ) # Fix random state for all the steps in exported pipeline set_param_recursive(exported_pipeline.steps, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)","title":"Boston housing prices modeling"},{"location":"examples/#titanic-survival-analysis","text":"To see the TPOT applied the Titanic Kaggle dataset, see the Jupyter notebook here . This example shows how to take a messy dataset and preprocess it such that it can be used in scikit-learn and TPOT.","title":"Titanic survival analysis"},{"location":"examples/#portuguese-bank-marketing","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"Portuguese Bank Marketing"},{"location":"examples/#magic-gamma-telescope","text":"The corresponding Jupyter notebook, containing the associated data preprocessing and analysis, can be found here .","title":"MAGIC Gamma Telescope"},{"location":"examples/#neural-network-classifier-using-tpot-nn","text":"By loading the TPOT-NN configuration dictionary , PyTorch estimators will be included for classification. 
Users can also create their own NN configuration dictionary that includes tpot.builtins.PytorchLRClassifier and/or tpot.builtins.PytorchMLPClassifier , or they can specify them using a template string, as shown in the following example: from tpot import TPOTClassifier from sklearn.datasets import make_blobs from sklearn.model_selection import train_test_split X, y = make_blobs(n_samples=100, centers=2, n_features=3, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25) clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier', verbosity=2, population_size=10, generations=10) clf.fit(X_train, y_train) print(clf.score(X_test, y_test)) clf.export('tpot_nn_demo_pipeline.py') This example is somewhat trivial, but it should result in nearly 100% classification accuracy.","title":"Neural network classifier using TPOT-NN"},{"location":"installing/","text":"Installation TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. Support for Python 3.4 and below has been officially dropped since version 0.11.0. You can install TPOT using pip or conda-forge . pip NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib pytorch DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delay] and dask[dataframe] and dask_ml . It is noted that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0. pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 distributed>=2.10.0 If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate To enable support for PyTorch -based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly. We recommend following PyTorch's installation instructions customized for your operating system and Python distribution. Finally to install TPOT itself, run the following command: pip install tpot conda-forge To install tpot and its core dependencies you can use: conda install -c conda-forge tpot To install additional dependencies you can use: conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate As mentioned above, we recommend following PyTorch's installation instructions for installing it to enable support for PyTorch -based neural networks (TPOT-NN). 
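Since several of the libraries above (XGBoost, Dask, PyTorch, scikit-mdr/skrebate) are optional, it can help to confirm which of them are importable in your environment before relying on the corresponding TPOT features. The snippet below is only a minimal illustrative sketch, not part of TPOT itself; the mapping of import names to features is an assumption based on the descriptions above.

import importlib

# Hypothetical mapping of optional import names to the TPOT features they enable
optional_packages = {
    "xgboost": "eXtreme Gradient Boosting estimators",
    "torch": "TPOT-NN (PyTorch) neural network estimators",
    "dask": "Dask-based parallel training",
    "skrebate": "TPOT-MDR configuration (together with scikit-mdr)",
}

for package, feature in optional_packages.items():
    try:
        module = importlib.import_module(package)
        version = getattr(module, "__version__", "unknown version")
        print("{}: installed ({}); {} available".format(package, version, feature))
    except ImportError:
        print("{}: not installed; {} will be unavailable".format(package, feature))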
Installation for using TPOT-cuML configuration With \"TPOT cuML\" configuration (see built-in configurations ), TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost . This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+ , and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor . Please download this conda environment yml file to install TPOT for using TPOT-cuML configuration. conda env create -f tpot-cuml.yml -n tpot-cuml conda activate tpot-cuml Installation problems Please file a new issue if you run into installation problems.","title":"Installation"},{"location":"installing/#installation","text":"TPOT is built on top of several existing Python libraries, including: NumPy SciPy scikit-learn DEAP update_checker tqdm stopit pandas joblib Most of the necessary Python packages can be installed via the Anaconda Python distribution , which we strongly recommend that you use. Support for Python 3.4 and below has been officially dropped since version 0.11.0. You can install TPOT using pip or conda-forge .","title":"Installation"},{"location":"installing/#pip","text":"NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command: conda install numpy scipy scikit-learn pandas joblib pytorch DEAP, update_checker, tqdm and stopit can be installed with pip via the command: pip install deap update_checker tqdm stopit Optionally , you can install XGBoost if you would like TPOT to use the eXtreme Gradient Boosting models. XGBoost is entirely optional, and TPOT will still function normally without XGBoost if you do not have it installed. Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors. pip install xgboost If you have issues installing XGBoost, check the XGBoost installation documentation . If you plan to use Dask for parallel training, make sure to install dask[delay] and dask[dataframe] and dask_ml . It is noted that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0. pip install dask[delayed] dask[dataframe] dask-ml fsspec>=0.3.3 distributed>=2.10.0 If you plan to use the TPOT-MDR configuration , make sure to install scikit-mdr and scikit-rebate : pip install scikit-mdr skrebate To enable support for PyTorch -based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly. We recommend following PyTorch's installation instructions customized for your operating system and Python distribution. 
Finally to install TPOT itself, run the following command: pip install tpot","title":"pip"},{"location":"installing/#conda-forge","text":"To install tpot and its core dependencies you can use: conda install -c conda-forge tpot To install additional dependencies you can use: conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate As mentioned above, we recommend following PyTorch's installation instructions for installing it to enable support for PyTorch -based neural networks (TPOT-NN).","title":"conda-forge"},{"location":"installing/#installation-for-using-tpot-cuml-configuration","text":"With \"TPOT cuML\" configuration (see built-in configurations ), TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost . This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+ , and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor . Please download this conda environment yml file to install TPOT for using TPOT-cuML configuration. conda env create -f tpot-cuml.yml -n tpot-cuml conda activate tpot-cuml","title":"Installation for using TPOT-cuML configuration"},{"location":"installing/#installation-problems","text":"Please file a new issue if you run into installation problems.","title":"Installation problems"},{"location":"related/","text":"Other Automated Machine Learning (AutoML) tools and related projects: Name Language License Description Auto-WEKA Java GPL-v3 Automated model selection and hyper-parameter tuning for Weka models. auto-sklearn Python BSD-3-Clause An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. auto_ml Python MIT Automated machine learning for analytics & production. Supports manual feature type declarations. H2O AutoML Java with Python, Scala & R APIs and web GUI Apache 2.0 Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. devol Python MIT Automated deep neural network design via genetic programming. MLBox Python BSD-3-Clause Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. Recipe C GPL-v3 Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. Xcessiv Python Apache 2.0 A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python. GAMA Python Apache 2.0 Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.","title":"Related"},{"location":"releases/","text":"Release Notes Version 0.11.6 Fix a bug causing point mutation function does not work properly with using template option Add a new built configuration called \"TPOT cuML\" which TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost . This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+ , and that the library cuML is installed. 
Add string path support for the log/log_file parameter Fix a bug in version 0.11.5 that caused no update to stdout after each generation Fix minor bugs Version 0.11.5 Make PyTorch an optional dependency Refine installation documentation Version 0.11.4 Add a new built-in configuration \"TPOT NN\" which includes all operators in \"Default TPOT\" plus additional neural network estimators written in PyTorch (currently tpot.builtins.PytorchLRClassifier and tpot.builtins.PytorchMLPClassifier for classification tasks only) Refine the log_file parameter's behavior Version 0.11.3 Fix a bug in TPOTRegressor in v0.11.2 Add a -log option to the command-line interface to save the process log to a file. Version 0.11.2 Fix a bug where the early_stop parameter did not work properly TPOT's built-in OneHotEncoder can now refit to different datasets Fix an issue where the evaluated_individuals_ attribute could not record correct generation info. Add a new parameter log_file to output logs to a file instead of sys.stdout Fix some code quality issues and mistakes in the documentation Fix minor bugs Version 0.11.1 Fix a compatibility issue with scikit-learn v0.22 warm_start now saves both Primitive Sets and evaluated_pipelines_ from previous runs; Fix an error where TPOT assigned wrong fitness scores to non-evaluated pipelines (interrupted by max_time_mins or KeyboardInterrupt ); Fix a bug where the mutation operator could not generate a new pipeline when template was not the default value and warm_start was True; Fix a bug where max_time_mins could not stop the optimization process when the search space was limited. Fix a bug in exported code when the exported pipeline contains only one estimator Fix spelling mistakes in the documentation Fix some code quality issues Version 0.11.0 Support for Python 3.4 and below has been officially dropped. Also support for scikit-learn 0.20 or below has been dropped. The support of a metric function with the signature score_func(y_true, y_pred) for the scoring parameter has been dropped. Refine StackingEstimator so it does not stack NaN/Infinity prediction probabilities. Fix a bug where the population did not persist with warm_start=True when max_time_mins was not the default value. Now the random_state parameter in TPOT is used for pipeline evaluation instead of the fixed random seed of 42 used previously. The set_param_recursive function has been moved to export_utils.py and it can be used in exported code for setting random_state recursively in a scikit-learn Pipeline. It is used to set random_state in the fitted_pipeline_ attribute and exported pipelines. TPOT can now use generations and max_time_mins independently to limit the optimization process, using either parameter or both. The .export() function will return a string of the exported pipeline if no output filename is specified. Add SGDClassifier and SGDRegressor into the TPOT default configs. Documentation has been updated Fix minor bugs. Version 0.10.2 TPOT v0.10.2 is the last version to support Python 2.7 and Python 3.4. Minor updates for fixing compatibility issues with the latest version of scikit-learn (version > 0.21) and xgboost (v0.90) The default value of the template parameter has been changed to None. Fix errors in documentation Version 0.10.1 Add a data_file_path option to the export function for replacing 'PATH/TO/DATA/FILE' with a customized dataset path in exported scripts. (Related issue #838) Change the Python version in CI tests to 3.7 Add CI tests for macOS. Version 0.10.0 Add a new template option to specify a desired structure for machine learning pipelines in TPOT. 
Check TPOT API (it will be updated once it is merge to master branch). Add FeatureSetSelector operator into TPOT for feature selection based on priori export knowledge. Please check our preprint paper for more details ( Note: it was named DatasetSelector in 1st version paper but we will rename to FeatureSetSelector in next version of the paper ) Refine n_jobs parameter to accept value below -1. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Now memory parameter can create memory cache directory if it does not exist. Fix minor bugs. Version 0.9.6 Fix a bug causing that max_time_mins parameter doesn't work when use_dask=True in TPOT 0.9.5 Now TPOT saves best pareto values best pareto pipeline s in checkpoint folder TPOT raises ImportError if operators in the TPOT configuration are not available when verbosity>2 Thank @PGijsbers for the suggestions. Now TPOT can save scores of individuals already evaluated in any generation even the evaluation process of that generation is interrupted/stopped. But it is noted that, in this case, TPOT will raise this warning message : WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation. , because the pipelines in early generation, e.g. 1st generation, are evolved/modified very limited times via evolutionary algorithm. Fix bugs in configuration of TPOTRegressor Error fixes in documentation Version 0.9.5 TPOT now supports integration with Dask for parallelization + smart caching . Big thanks to the Dask dev team for making this happen! TPOT now supports for imputation/sparse matrices into predict and predict_proba functions. TPOTClassifier and TPOTRegressor now follows scikit-learn estimator API. We refined scoring parameter in TPOT API for accepting Scorer object . We refined parameters in VarianceThreshold and FeatureAgglomeration. TPOT now supports using memory caching within a Pipeline via an optional memory parameter. We improved documentation of TPOT. Version 0.9 TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. 
Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator. Version 0.8 TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code. Version 0.7 TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance. Version 0.6 TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. 
XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score. Version 0.5 Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes. Version 0.4 In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions Version 0.3 We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over. Version 0.2 TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline. Version 0.1 First public release of TPOT. 
Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Release Notes"},{"location":"releases/#release-notes","text":"","title":"Release Notes"},{"location":"releases/#version-0116","text":"Fix a bug causing point mutation function does not work properly with using template option Add a new built configuration called \"TPOT cuML\" which TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost . This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+ , and that the library cuML is installed. Add string path support for log/log_file parameter Fix a bug in version 0.11.5 causing no update in stdout after each generation Fix minor bugs","title":"Version 0.11.6"},{"location":"releases/#version-0115","text":"Make Pytorch as an optional dependency Refine installation documentation","title":"Version 0.11.5"},{"location":"releases/#version-0114","text":"Add a new built configuration \"TPOT NN\" which includes all operators in \"Default TPOT\" plus additional neural network estimators written in PyTorch (currently tpot.builtins.PytorchLRClassifier and tpot.builtins.PytorchMLPClassifier for classification tasks only) Refine log_file parameter's behavior","title":"Version 0.11.4"},{"location":"releases/#version-0113","text":"Fix a bug in TPOTRegressor in v0.11.2 Add -log option in command line interface to save process log to a file.","title":"Version 0.11.3"},{"location":"releases/#version-0112","text":"Fix early_stop parameter does not work properly TPOT built-in OneHotEncoder can refit to different datasets Fix the issue that the attribute evaluated_individuals_ cannot record correct generation info. Add a new parameter log_file to output logs to a file instead of sys.stdout Fix some code quality issues and mistakes in documentations Fix minor bugs","title":"Version 0.11.2"},{"location":"releases/#version-0111","text":"Fix compatibility issue with scikit-learn v0.22 warm_start now saves both Primitive Sets and evaluated_pipelines_ from previous runs; Fix the error that TPOT assign wrong fitness scores to non-evaluated pipelines (interrupted by max_min_mins or KeyboardInterrupt ) ; Fix the bug that mutation operator cannot generate new pipeline when template is not default value and warm_start is True; Fix the bug that max_time_mins cannot stop optimization process when search space is limited. Fix a bug in exported codes when the exported pipeline is only 1 estimator Fix spelling mistakes in documentations Fix some code quality issues","title":"Version 0.11.1"},{"location":"releases/#version-0110","text":"Support for Python 3.4 and below has been officially dropped. Also support for scikit-learn 0.20 or below has been dropped. The support of a metric function with the signature score_func(y_true, y_pred) for scoring parameter has been dropped. Refine StackingEstimator for not stacking NaN/Infinity predication probabilities. Fix a bug that population doesn't persist by warm_start=True when max_time_mins is not default value. Now the random_state parameter in TPOT is used for pipeline evaluation instead of using a fixed random seed of 42 before. The set_param_recursive function has been moved to export_utils.py and it can be used in exported codes for setting random_state recursively in scikit-learn Pipeline. It is used to set random_state in fitted_pipeline_ attribute and exported pipelines. 
TPOT can independently use generations and max_time_mins to limit the optimization process through using one of the parameters or both. .export() function will return string of exported pipeline if output filename is not specified. Add SGDClassifier and SGDRegressor into TPOT default configs. Documentation has been updated Fix minor bugs.","title":"Version 0.11.0"},{"location":"releases/#version-0102","text":"TPOT v0.10.2 is the last version to support Python 2.7 and Python 3.4. Minor updates for fixing compatibility issues with the latest version of scikit-learn (version > 0.21) and xgboost (v0.90) Default value of template parameter is changed to None instead. Fix errors in documentation","title":"Version 0.10.2"},{"location":"releases/#version-0101","text":"Add data_file_path option into expert function for replacing 'PATH/TO/DATA/FILE' to customized dataset path in exported scripts. (Related issue #838) Change python version in CI tests to 3.7 Add CI tests for macOS.","title":"Version 0.10.1"},{"location":"releases/#version-0100","text":"Add a new template option to specify a desired structure for machine learning pipeline in TPOT. Check TPOT API (it will be updated once it is merge to master branch). Add FeatureSetSelector operator into TPOT for feature selection based on priori export knowledge. Please check our preprint paper for more details ( Note: it was named DatasetSelector in 1st version paper but we will rename to FeatureSetSelector in next version of the paper ) Refine n_jobs parameter to accept value below -1. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Now memory parameter can create memory cache directory if it does not exist. Fix minor bugs.","title":"Version 0.10.0"},{"location":"releases/#version-096","text":"Fix a bug causing that max_time_mins parameter doesn't work when use_dask=True in TPOT 0.9.5 Now TPOT saves best pareto values best pareto pipeline s in checkpoint folder TPOT raises ImportError if operators in the TPOT configuration are not available when verbosity>2 Thank @PGijsbers for the suggestions. Now TPOT can save scores of individuals already evaluated in any generation even the evaluation process of that generation is interrupted/stopped. But it is noted that, in this case, TPOT will raise this warning message : WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation. , because the pipelines in early generation, e.g. 1st generation, are evolved/modified very limited times via evolutionary algorithm. Fix bugs in configuration of TPOTRegressor Error fixes in documentation","title":"Version 0.9.6"},{"location":"releases/#version-095","text":"TPOT now supports integration with Dask for parallelization + smart caching . Big thanks to the Dask dev team for making this happen! TPOT now supports for imputation/sparse matrices into predict and predict_proba functions. TPOTClassifier and TPOTRegressor now follows scikit-learn estimator API. We refined scoring parameter in TPOT API for accepting Scorer object . We refined parameters in VarianceThreshold and FeatureAgglomeration. TPOT now supports using memory caching within a Pipeline via an optional memory parameter. We improved documentation of TPOT.","title":"Version 0.9.5"},{"location":"releases/#version-09","text":"TPOT now supports sparse matrices with a new built-in TPOT configuration, \"TPOT sparse\". We are using a custom OneHotEncoder implementation that supports missing values and continuous features. 
We have added an \"early stopping\" option for stopping the optimization process if no improvement is made within a set number of generations. Look up the early_stop parameter to access this functionality. TPOT now reduces the number of duplicated pipelines between generations, which saves you time during the optimization process. TPOT now supports custom scoring functions via the command-line mode. We have added a new optional argument, periodic_checkpoint_folder , that allows TPOT to periodically save the best pipeline so far to a local folder during optimization process. TPOT no longer uses sklearn.externals.joblib when n_jobs=1 to avoid the potential freezing issue that scikit-learn suffers from . We have added pandas as a dependency to read input datasets instead of numpy.recfromcsv . NumPy's recfromcsv function is unable to parse datasets with complex data types. Fixed a bug that DEFAULT in the parameter(s) of nested estimator raises KeyError when exporting pipelines. Fixed a bug related to setting random_state in nested estimators. The issue would happen with pipeline with SelectFromModel ( ExtraTreesClassifier as nested estimator) or StackingEstimator if nested estimator has random_state parameter. Fixed a bug in the missing value imputation function in TPOT to impute along columns instead rows. Refined input checking for sparse matrices in TPOT. Refined the TPOT pipeline mutation operator.","title":"Version 0.9"},{"location":"releases/#version-08","text":"TPOT now detects whether there are missing values in your dataset and replaces them with the median value of the column. TPOT now allows you to set a group parameter in the fit function so you can use the GroupKFold cross-validation strategy. TPOT now allows you to set a subsample ratio of the training instance with the subsample parameter. For example, setting subsample =0.5 tells TPOT to create a fixed subsample of half of the training data for the pipeline optimization process. This parameter can be useful for speeding up the pipeline optimization process, but may give less accurate performance estimates from cross-validation. TPOT now has more built-in configurations , including TPOT MDR and TPOT light, for both classification and regression problems. TPOTClassifier and TPOTRegressor now expose three useful internal attributes, fitted_pipeline_ , pareto_front_fitted_pipelines_ , and evaluated_individuals_ . These attributes are described in the API documentation . Oh, TPOT now has thorough API documentation . Check it out! Fixed a reproducibility issue where setting random_seed didn't necessarily result in the same results every time. This bug was present since TPOT v0.7. Refined input checking in TPOT. Removed Python 2 uncompliant code.","title":"Version 0.8"},{"location":"releases/#version-07","text":"TPOT now has multiprocessing support. TPOT allows you to use multiple processes in parallel to accelerate the pipeline optimization process in TPOT with the n_jobs parameter. TPOT now allows you to customize the operators and parameters considered during the optimization process , which can be accomplished with the new config_dict parameter. The format of this customized dictionary can be found in the online documentation , along with a list of built-in configurations . TPOT now allows you to specify a time limit for evaluating a single pipeline (default limit is 5 minutes) in optimization process with the max_eval_time_mins parameter, so TPOT won't spend hours evaluating overly-complex pipelines. 
We tweaked TPOT's underlying evolutionary optimization algorithm to work even better, including using the mu+lambda algorithm . This algorithm gives you more control of how many pipelines are generated every iteration with the offspring_size parameter. Refined the default operators and parameters in TPOT, so TPOT 0.7 should work even better than 0.6. TPOT now supports sample weights in the fitness function if some if your samples are more important to classify correctly than others. The sample weights option works the same as in scikit-learn, e.g., tpot.fit(x_train, y_train, sample_weights=sample_weights) . The default scoring metric in TPOT has been changed from balanced accuracy to accuracy, the same default metric for classification algorithms in scikit-learn. Balanced accuracy can still be used by setting scoring='balanced_accuracy' when creating a TPOT instance.","title":"Version 0.7"},{"location":"releases/#version-06","text":"TPOT now supports regression problems! We have created two separate TPOTClassifier and TPOTRegressor classes to support classification and regression problems, respectively. The command-line interface also supports this feature through the -mode parameter. TPOT now allows you to specify a time limit for the optimization process with the max_time_mins parameter, so you don't need to guess how long TPOT will take any more to recommend a pipeline to you. Added a new operator that performs feature selection using ExtraTrees feature importance scores. XGBoost has been added as an optional dependency to TPOT. If you have XGBoost installed, TPOT will automatically detect your installation and use the XGBoostClassifier and XGBoostRegressor in its pipelines. TPOT now offers a verbosity level of 3 (\"science mode\"), which outputs the entire Pareto front instead of only the current best score. This feature may be useful for users looking to make a trade-off between pipeline complexity and score.","title":"Version 0.6"},{"location":"releases/#version-05","text":"Major refactor: Each operator is defined in a separate class file. Hooray for easier-to-maintain code! TPOT now exports directly to scikit-learn Pipelines instead of hacky code. Internal representation of individuals now uses scikit-learn pipelines. Parameters for each operator have been optimized so TPOT spends less time exploring useless parameters. We have removed pandas as a dependency and instead use numpy matrices to store the data. TPOT now uses k-fold cross-validation when evaluating pipelines, with a default k = 3. This k parameter can be tuned when creating a new TPOT instance. Improved scoring function support : Even though TPOT uses balanced accuracy by default, you can now have TPOT use any of the scoring functions that cross_val_score supports. Added the scikit-learn Normalizer preprocessor. Minor text fixes.","title":"Version 0.5"},{"location":"releases/#version-04","text":"In TPOT 0.4, we've made some major changes to the internals of TPOT and added some convenience functions. We've summarized the changes below. 
Added new sklearn models and preprocessors AdaBoostClassifier BernoulliNB ExtraTreesClassifier GaussianNB MultinomialNB LinearSVC PassiveAggressiveClassifier GradientBoostingClassifier RBFSampler FastICA FeatureAgglomeration Nystroem Added operator that inserts virtual features for the count of features with values of zero Reworked parameterization of TPOT operators Reduced parameter search space with information from a scikit-learn benchmark TPOT no longer generates arbitrary parameter values, but uses a fixed parameter set instead Removed XGBoost as a dependency Too many users were having install issues with XGBoost Replaced with scikit-learn's GradientBoostingClassifier Improved descriptiveness of TPOT command line parameter documentation Removed min/max/avg details during fit() when verbosity > 1 Replaced with tqdm progress bar Added tqdm as a dependency Added fit_predict() convenience function Added get_params() function so TPOT can operate in scikit-learn's cross_val_score & related functions","title":"Version 0.4"},{"location":"releases/#version-03","text":"We revised the internal optimization process of TPOT to make it more efficient, in particular in regards to the model parameters that TPOT optimizes over.","title":"Version 0.3"},{"location":"releases/#version-02","text":"TPOT now has the ability to export the optimized pipelines to sklearn code. Logistic regression, SVM, and k-nearest neighbors classifiers were added as pipeline operators. Previously, TPOT only included decision tree and random forest classifiers. TPOT can now use arbitrary scoring functions for the optimization process. TPOT now performs multi-objective Pareto optimization to balance model complexity (i.e., # of pipeline operators) and the score of the pipeline.","title":"Version 0.2"},{"location":"releases/#version-01","text":"First public release of TPOT. Optimizes pipelines with decision trees and random forest classifiers as the model, and uses a handful of feature preprocessors.","title":"Version 0.1"},{"location":"support/","text":"TPOT was developed in the Computational Genetics Lab at the University of Pennsylvania with funding from the NIH under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.","title":"Support"},{"location":"using/","text":"Using TPOT What to expect from AutoML software Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT. AutoML algorithms aren't intended to run for only a few minutes Of course, you can run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset. However, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not find any suitable pipeline at all, in which case a RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') will be raised. Often it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search the pipeline space for your dataset. 
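As a rough, illustrative sketch of this trade-off (not taken from the official examples), the snippet below runs a deliberately tiny TPOT search on the digits data and guards the scoring step, since score() raises the RuntimeError quoted above whenever no pipeline has been successfully optimized; the very small generations and population_size values are only for demonstration.

from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25, random_state=42)

# A deliberately tiny search: quick to run, but unlikely to find the best possible pipeline
tpot = TPOTClassifier(generations=2, population_size=10, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
try:
    print(tpot.score(X_test, y_test))
except RuntimeError as e:
    # Raised when no pipeline was successfully optimized during fit()
    print(e)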
AutoML algorithms can take a long time to finish their search AutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms (random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling, PCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways to ensemble or stack the algorithms within the pipeline. As such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings (100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing. To put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm and how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation, which means that roughly 100,000 models are fit and evaluated on the training data in one grid search. That's a time-consuming procedure, even for simpler models like decision trees. Typical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt the run partway through and see the best results so far. TPOT also provides a warm_start parameter that lets you restart a TPOT run from where it left off. AutoML algorithms can recommend different solutions for the same dataset If you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs may result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means that it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different pipelines, this means that the TPOT runs didn't converge due to lack of time or that multiple pipelines perform more-or-less the same on your dataset. This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search. TPOT with code We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. 
You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets. TPOT on the command line To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer or None Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. 
-scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instances. Setting it to 0.5 means that TPOT randomly collects half of the training samples for the pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. If not None, this setting will allow TPOT to run until max_time_mins minutes have elapsed and then stop. TPOT will stop earlier if generations is set and all generations are already evaluated. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines, but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process. string 'TPOT light': TPOT will use a built-in configuration with only fast models and preprocessors. string 'TPOT MDR': TPOT will use a built-in configuration specialized for genomic studies. string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of a predefined pipeline structure. This option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports a linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g.
`SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. -log LOG Folder path Save progress content to a file. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit. Scoring functions TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module. Built-in TPOT configurations TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. 
Classification Regression TPOT NN TPOT uses the same configuration as \"Default TPOT\" plus additional neural network estimators written in PyTorch (currently only `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier`). Currently only classification is supported, but future releases will include regression estimators. Classification TPOT cuML TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost . This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Customizing TPOT's operators and parameters Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. 
For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. Template option in TPOT The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template but Regressor for a TPOTRegressor . Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for use in the 1st step of the pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'. FeatureSetSelector in TPOT FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format.
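As a purely hypothetical illustration (the header row, set names, and feature names below are made up for this sketch; see the subset_test.csv file referenced in the example that follows for the exact format TPOT expects), such a subset file could look like:

Subset,Size,Features
immune_response,3,gene_1;gene_7;gene_42
cell_cycle,2,gene_3;gene_9

Each row defines one selectable feature set, following the three-column layout described next.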
In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is an example how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y) Pipeline caching in TPOT With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore. Crash/freeze issue with n_jobs > 1 under OSX or Linux Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following codes into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation . 
Parallel Training with Dask For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10 * n_jobs, if that is less than the offspring size) of parallel training. estimator = TPOTClassifier(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('scheduler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more. Neural Networks in TPOT ( tpot.nn ) Support for neural network models and deep learning is an experimental feature newly added to TPOT. Available neural network architectures are provided by the tpot.nn module. Unlike regular sklearn estimators, these models need to be written by hand, and must also inherit the appropriate base classes provided by sklearn for all of their built-in modules. In other words, they need to implement methods like .fit() , fit_transform() , get_params() , etc., as described in detail in Developing scikit-learn estimators . Telling TPOT to use built-in PyTorch neural network models Mainly due to the issues described below, TPOT won't use its neural network models unless you explicitly tell it to do so. This is done as follows: Use import tpot.nn before instantiating any TPOT estimators. Use a configuration dictionary that includes one or more tpot.nn estimators, either by writing one manually, including one from a file, or by importing the configuration in tpot/config/classifier_nn.py . A very simple example that will force TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is as follows: tpot_config = { 'tpot.nn.PytorchLRClassifier': { 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.] } } Alternatively, use a template string including PytorchLRClassifier or PytorchMLPClassifier while loading the TPOT-NN configuration dictionary. Neural network models are notorious for being extremely sensitive to their initialization parameters, so you may need to heavily adjust tpot.nn configuration dictionaries in order to attain good performance on your dataset. A simple example of using TPOT-NN is shown in examples . Important caveats Neural network models (especially when they reach moderately large sizes) take a notoriously large amount of time and computing power to train. You should expect tpot.nn neural networks to train several orders of magnitude slower than their sklearn alternatives. This can be alleviated somewhat by training the models on computers with CUDA-enabled GPUs. TPOT will occasionally learn pipelines that stack several sklearn estimators.
Mathematically, these can be nearly identical to some deep learning models. For example, by stacking several sklearn.linear_model.LogisticRegression s, you end up with a very close approximation of a Multilayer Perceptron; one of the simplest and most well known deep learning architectures. TPOT's genetic programming algorithms generally optimize these 'networks' much faster than PyTorch, which typically uses a more brute-force convex optimization approach. The problem of 'black box' model introspection is one of the most substantial criticisms and challenges of deep learning. This problem persists in tpot.nn , whereas TPOT's default estimators often are far easier to introspect.","title":"Using TPOT"},{"location":"using/#using-tpot","text":"","title":"Using TPOT"},{"location":"using/#what-to-expect-from-automl-software","text":"Automated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to, so we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.","title":"What to expect from AutoML software"},{"location":"using/#tpot-with-code","text":"We've taken care to design the TPOT interface to be as similar as possible to scikit-learn. TPOT can be imported just like any regular Python module. To import TPOT, type: from tpot import TPOTClassifier then create an instance of TPOT as follows: pipeline_optimizer = TPOTClassifier() It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier . You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation . Some example code with custom TPOT parameters might look like: pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function: pipeline_optimizer.fit(X_train, y_train) The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model. You can then proceed to evaluate the final pipeline on the testing set with the score function: print(pipeline_optimizer.score(X_test, y_test)) Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function: pipeline_optimizer.export('tpot_exported_pipeline.py') Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline. Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file. 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') Check our examples to see TPOT applied to some specific data sets.","title":"TPOT with code"},{"location":"using/#tpot-on-the-command-line","text":"To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer or None Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. By default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. We recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. 
By default, accuracy is used for classification and mean squared error (MSE) is used for regression. TPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. my_module.scorer_name: You can also specify your own function or a full python path to an existing one. See the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. How many minutes TPOT has to optimize the pipeline.If not None, this setting will allow TPOT to run until max_time_mins minutes elapsed and then stop. TPOT will stop earlier if generationsis set and all generations are already evaluated. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. Setting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. Set this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is None, TPOT generates tree-based pipeline randomly. See the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path If supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. This is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working Example: mkdir my_checkpoints -cf ./my_checkpoints -es EARLY_STOP Any positive integer How many generations TPOT checks whether there is no improvement in optimization process. End optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. -log LOG Folder path Save progress content to a file. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.","title":"TPOT on the command line"},{"location":"using/#scoring-functions","text":"TPOT makes use of sklearn.model_selection.cross_val_score for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT: You can pass in a string to the scoring parameter from the list above. Any other strings will cause TPOT to throw an exception. You can pass the callable object/function with signature scorer(estimator, X, y) , where estimator is trained estimator to use for scoring, X are features that will be passed to estimator.predict and y are target values for X . To do this, you should implement your own function. See the example below for further explanation. 
from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) # Make a custom metric function def my_custom_accuracy(y_true, y_pred): return float(sum(y_pred == y_true)) / len(y_true) # Make a custom a scorer from the custom metric function # Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized. my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring=my_custom_scorer) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') my_module.scorer_name : You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from sklearn.metrics module.","title":"Scoring functions"},{"location":"using/#built-in-tpot-configurations","text":"TPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT. Configuration Name Description Operators Default TPOT TPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets. Note: This is the default configuration for TPOT. To use this configuration, use the default value (None) for the config_dict parameter. Classification Regression TPOT light TPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem. This configuration works for both the TPOTClassifier and TPOTRegressor. Classification Regression TPOT MDR TPOT will search over a series of feature selectors and Multifactor Dimensionality Reduction models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for genome-wide association studies (GWAS) , and is described in detail online here . Note that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets. Classification Regression TPOT sparse TPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. This configuration works for both the TPOTClassifier and TPOTRegressor. 
Classification Regression TPOT NN TPOT uses the same configuration as \"Default TPOT\" plus additional neural network estimators written in PyTorch (currently only `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier`). Currently only classification is supported, but future releases will include regression estimators. Classification TPOT cuML TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost . This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor. Classification Regression To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the \"TPOT light\" configuration: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict='TPOT light') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py')","title":"Built-in TPOT configurations"},{"location":"using/#customizing-tpots-operators-and-parameters","text":"Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . For a simple example, the configuration could be: tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. 
For example: from tpot import TPOTClassifier from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split digits = load_digits() X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, train_size=0.75, test_size=0.25) tpot_config = { 'sklearn.naive_bayes.GaussianNB': { }, 'sklearn.naive_bayes.BernoulliNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] }, 'sklearn.naive_bayes.MultinomialNB': { 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], 'fit_prior': [True, False] } } tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) tpot.export('tpot_digits_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.","title":"Customizing TPOT's operators and parameters"},{"location":"using/#template-option-in-tpot","text":"Template option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. Below is a simple example to use template option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of SelectorMixin ), 2nd step is a feature transformer (a subclass of TransformerMixin ) and 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is subclass of TransformerMixin in scikit-learn, but Transformer in this option excludes those subclasses of SelectorMixin . tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) If a specific operator, e.g. SelectPercentile , is preferred for usage in the 1st step of the pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.","title":"Template option in TPOT"},{"location":"using/#featuresetselector-in-tpot","text":"FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori expert knowledge. 
For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ( MSigDB ) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is an example how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict test_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\") test_X = test_data.drop(\"class\", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, template='FeatureSetSelector-Transformer-Classifier', config_dict=classifier_config_dict) tpot.fit(test_X, test_y)","title":"FeatureSetSelector in TPOT"},{"location":"using/#pipeline-caching-in-tpot","text":"With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier from tempfile import mkdtemp from joblib import Memory from shutil import rmtree # Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown tpot = TPOTClassifier(memory='auto') # Method 2, with a custom directory for memory caching tpot = TPOTClassifier(memory='/to/your/path') # Method 3, with a Memory object cachedir = mkdtemp() # Create a temporary folder memory = Memory(cachedir=cachedir, verbose=0) tpot = TPOTClassifier(memory=memory) # Clear the cache directory when you don't need it anymore rmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.","title":"Pipeline caching in TPOT"},{"location":"using/#crashfreeze-issue-with-n_jobs-1-under-osx-or-linux","text":"Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does , especially with large datasets. One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork ) to manage the process pools. 
You can enable the forkserver mode globally for your program by putting the following codes into your main script: import multiprocessing # other imports, custom code, load data, define model... if __name__ == '__main__': multiprocessing.set_start_method('forkserver') # call scikit-learn utils or tpot utils with n_jobs > 1 here More information about these start methods can be found in the multiprocessing documentation .","title":"Crash/freeze issue with n_jobs > 1 under OSX or Linux"},{"location":"using/#parallel-training-with-dask","text":"For large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on the your Dask cluster. If n_jobs is specified, then it will control the chunk size (10* n_jobs if it is less then offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : import joblib import distributed.joblib from dask.distributed import Client # connect to the cluster client = Client('schedueler-address') # create the estimator normally estimator = TPOTClassifier(n_jobs=-1) # perform the fit in this context manager with joblib.parallel_backend(\"dask\"): estimator.fit(X, y) See dask's distributed joblib integration for more.","title":"Parallel Training with Dask"},{"location":"using/#neural-networks-in-tpot-tpotnn","text":"Support for neural network models and deep learning is an experimental feature newly added to TPOT. Available neural network architectures are provided by the tpot.nn module. Unlike regular sklearn estimators, these models need to be written by hand, and must also inherit the appropriate base classes provided by sklearn for all of their built-in modules. In other words, they need implement methods like .fit() , fit_transform() , get_params() , etc., as described in detail on Developing scikit-learn estimators .","title":"Neural Networks in TPOT (tpot.nn)"},{"location":"using/#telling-tpot-to-use-built-in-pytorch-neural-network-models","text":"Mainly due to the issues described below, TPOT won't use its neural network models unless you explicitly tell it to do so. This is done as follows: Use import tpot.nn before instantiating any TPOT estimators. Use a configuration dictionary that includes one or more tpot.nn estimators, either by writing one manually, including one from a file, or by importing the configuration in tpot/config/classifier_nn.py . A very simple example that will force TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is as follows: tpot_config = { 'tpot.nn.PytorchLRClassifier': { 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.] } } Alternatively, use a template string including PytorchLRClassifier or PytorchMLPClassifier while loading the TPOT-NN configuration dictionary. 
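For instance, a minimal sketch of that template-string approach (the generations, population_size, and verbosity values below are purely illustrative) might look like:

import tpot.nn  # make the tpot.nn estimators available before creating the TPOT estimator
from tpot import TPOTClassifier

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    config_dict='TPOT NN',  # built-in TPOT-NN configuration described above
    template='Selector-Transformer-PytorchLRClassifier',  # force the final step to the PyTorch logistic regression classifier
)

Here the template constrains every evaluated pipeline to end in PytorchLRClassifier, while the rest of the search proceeds as usual.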
Neural network models are notorious for being extremely sensitive to their initialization parameters, so you may need to heavily adjust tpot.nn configuration dictionaries in order to attain good performance on your dataset. A simple example of using TPOT-NN is shown in examples .","title":"Telling TPOT to use built-in PyTorch neural network models"},{"location":"using/#important-caveats","text":"Neural network models (especially when they reach moderately large sizes) take a notoriously large amount of time and computing power to train. You should expect tpot.nn neural networks to train several orders of magnitude slower than their sklearn alternatives. This can be alleviated somewhat by training the models on computers with CUDA-enabled GPUs. TPOT will occasionally learn pipelines that stack several sklearn estimators. Mathematically, these can be nearly identical to some deep learning models. For example, by stacking several sklearn.linear_model.LogisticRegression s, you end up with a very close approximation of a Multilayer Perceptron; one of the simplest and most well known deep learning architectures. TPOT's genetic programming algorithms generally optimize these 'networks' much faster than PyTorch, which typically uses a more brute-force convex optimization approach. The problem of 'black box' model introspection is one of the most substantial criticisms and challenges of deep learning. This problem persists in tpot.nn , whereas TPOT's default estimators often are far easier to introspect.","title":"Important caveats"}]}
\ No newline at end of file
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index eba5ad03..ef12a0f0 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -1,43 +1,43 @@
 http://epistasislab.github.io/tpot/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/installing/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/using/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/api/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/examples/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/contributing/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/releases/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/citing/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/support/
-2020-07-21
+2020-10-26
 daily
 http://epistasislab.github.io/tpot/related/
-2020-07-21
+2020-10-26
 daily
\ No newline at end of file
diff --git a/docs/sitemap.xml.gz b/docs/sitemap.xml.gz
index 3ec7e0f5ba81e0aeae67fa4fe64720b6c2edd817..a9c444beccd4d725626afe81bc50f5d9139c8626 100644
GIT binary patch
literal 269
zcmV+o0rLJIiwFoB-j-hi|8r?{Wo=<_E_iKh0L_%aYQ-QBMfdrN;69p{MQMxi?fM1n
z2bg4<3>dY}q5u@m*!Di
O{KTw2)NE>PrX=})axT|V64ge90ojNp;lRzV(E*jjGzQNJhm8wcT>=M|3hN3>Yl*fPL*l{~XBnn{
TL#pmq{`So`wt()yCI$ciuh@pw
diff --git a/docs/using/index.html b/docs/using/index.html
index ad2c893d..35348646 100644
--- a/docs/using/index.html
+++ b/docs/using/index.html
@@ -195,37 +195,31 @@
          AutoML algorithms can recommend different solutions for the same dataset
          TPOT with code

          We've taken care to design the TPOT interface to be as similar as possible to scikit-learn.

          TPOT can be imported just like any regular Python module. To import TPOT, type:

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           
          -

          then create an instance of TPOT as follows:

          -
          pipeline_optimizer = TPOTClassifier()
          +
          pipeline_optimizer = TPOTClassifier()
           
          -

          It's also possible to use TPOT for regression problems with the TPOTRegressor class. Other than the class name, a TPOTRegressor is used the same way as a TPOTClassifier. You can read more about the TPOTClassifier and TPOTRegressor classes in the API documentation.
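For example, a minimal regression run might look like the following (the dataset and parameter values here are only illustrative):

from tpot import TPOTRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# load a small regression dataset and hold out a test set
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

pipeline_optimizer = TPOTRegressor(generations=5, population_size=20, cv=5,
                                   random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))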

          Some example code with custom TPOT parameters might look like:

          -
          pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
          +
          pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                               random_state=42, verbosity=2)
           
          -

          Now TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the fit function:

          -
          pipeline_optimizer.fit(X_train, y_train)
          +
          pipeline_optimizer.fit(X_train, y_train)
           
          -

The fit function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation. Then, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.

          You can then proceed to evaluate the final pipeline on the testing set with the score function:

          -
          print(pipeline_optimizer.score(X_test, y_test))
          +
          print(pipeline_optimizer.score(X_test, y_test))
           
          -

          Finally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the export function:

          -
          pipeline_optimizer.export('tpot_exported_pipeline.py')
          +
          pipeline_optimizer.export('tpot_exported_pipeline.py')
           
          -

          Once this code finishes running, tpot_exported_pipeline.py will contain the Python code for the optimized pipeline.
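The exact contents depend on the pipeline TPOT finds for your data, but an exported script typically has roughly the following shape (everything below, including the chosen estimator, its hyperparameters, and the data-loading placeholders, is illustrative):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: adjust the path/separator and the name of the outcome column for your data
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# the estimator and hyperparameters below stand in for whatever TPOT selected
exported_pipeline = RandomForestClassifier(n_estimators=100)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)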

          Below is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           from sklearn.datasets import load_digits
           from sklearn.model_selection import train_test_split
           
          @@ -239,22 +233,18 @@ 

          TPOT with code

print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline.py')
          -

          Check our examples to see TPOT applied to some specific data sets.

          TPOT on the command line

          To use TPOT via the command line, enter the following command with a path to the data file:

          -
          tpot /path_to/data_file.csv
          +
          tpot /path_to/data_file.csv
           
          -

          An example command-line call to TPOT may look like:

          -
          tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2
          +
          tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2
           
          -

          TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments, enter the following command:

          -
          tpot --help
          +
          tpot --help
           
          -

          Detailed descriptions of the command-line arguments are below.

          @@ -491,7 +481,7 @@

          Scoring functions

You can pass the callable object/function with signature scorer(estimator, X, y), where estimator is a trained estimator to use for scoring, X are the features that will be passed to estimator.predict, and y are the target values for X. To do this, you should implement your own function. See the example below for further explanation.

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           from sklearn.datasets import load_digits
           from sklearn.model_selection import train_test_split
           from sklearn.metrics import make_scorer
          @@ -513,7 +503,6 @@ 

          Scoring functions

print(tpot.score(X_test, y_test))
tpot.export('tpot_digits_pipeline.py')
          -
• my_module.scorer_name: You can also use a custom score_func(y_true, y_pred) or scorer(estimator, X, y) function through the command line by adding the argument -scoring my_module.scorer to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT. Example: -scoring sklearn.metrics.auc will use the function auc from the sklearn.metrics module. A minimal sketch of such a module is shown below.
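The file name my_module.py and the metric used here are illustrative; only the scorer(estimator, X, y) signature matters:

# my_module.py -- place this file in the directory from which you run TPOT
from sklearn.metrics import balanced_accuracy_score

def scorer(estimator, X, y):
    # scorer(estimator, X, y) form: score a fitted estimator on X and y
    return balanced_accuracy_score(y, estimator.predict(X))

It could then be used from the command line with, e.g., -scoring my_module.scorer.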
          • @@ -568,17 +557,27 @@

            Built-in TPOT configurations

-TPOT-NN
+TPOT NN
 TPOT uses the same configuration as "Default TPOT" plus additional neural network estimators written in PyTorch (currently only `tpot.builtins.PytorchLRClassifier` and `tpot.builtins.PytorchMLPClassifier`).

 Currently only classification is supported, but future releases will include regression estimators.
 Classification
+TPOT cuML
+TPOT will search over a restricted configuration using the GPU-accelerated estimators in RAPIDS cuML and DMLC XGBoost. This configuration requires an NVIDIA Pascal architecture or better GPU with compute capability 6.0+, and that the library cuML is installed. With this configuration, all model training and predicting will be GPU-accelerated.
+This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the TPOTClassifier and TPOTRegressor.
+Classification
+Regression

          To use any of these configurations, simply pass the string name of the configuration to the config_dict parameter (or -config on the command line). For example, to use the "TPOT light" configuration:

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           from sklearn.datasets import load_digits
           from sklearn.model_selection import train_test_split
           
          @@ -593,12 +592,11 @@ 

          Built-in TPOT configurations

          tpot.export('tpot_digits_pipeline.py')
          -

          Customizing TPOT's operators and parameters

          Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.

          The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False].

          For a simple example, the configuration could be:

          -
          tpot_config = {
          +
          tpot_config = {
               'sklearn.naive_bayes.GaussianNB': {
               },
           
          @@ -613,9 +611,8 @@ 

Customizing TPOT's operators

    }
}

          -

in which case TPOT would only consider pipelines containing GaussianNB, BernoulliNB, MultinomialNB, and tune those algorithms' parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier/TPOTRegressor config_dict parameter, described above. For example:

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           from sklearn.datasets import load_digits
           from sklearn.model_selection import train_test_split
           
          @@ -644,27 +641,24 @@ 

Customizing TPOT's operators

print(tpot.score(X_test, y_test))
tpot.export('tpot_digits_pipeline.py')

          -

          Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py, that configuration could be used on the command line with the command:

          tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py
           
          -

          When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config. Otherwise, TPOT will not be able to locate the configuration dictionary.
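For instance, reusing the simple example configuration above, tpot_classifier_config.py might contain nothing more than the following (the BernoulliNB and MultinomialNB parameter ranges shown here are illustrative, not necessarily the exact values used elsewhere in the documentation):

# tpot_classifier_config.py -- the dictionary must be named exactly tpot_config
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {
    },

    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },

    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    }
}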

          For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code.

Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import or use XGBoost in the pipelines it considers.

          Template option in TPOT

The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines.

Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin), the 2nd step is a feature transformer (a subclass of TransformerMixin) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin). The last step must be Classifier for TPOTClassifier's template but Regressor for TPOTRegressor. Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin.

          -
          tpot_obj = TPOTClassifier(
          +
          tpot_obj = TPOTClassifier(
                           template='Selector-Transformer-Classifier'
                           )
           
          -

If a specific operator, e.g. SelectPercentile, is preferred for use in the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'.

          FeatureSetSelector in TPOT

          -

          FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database (MSigDB) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by ";". Below is a example how to use this operator in TPOT.

          +

FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database (MSigDB) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in one set, and the 3rd column is a list of feature names (if input X is a pandas.DataFrame) or indexes (if input X is a numpy.ndarray) delimited by ";". Below is an example of how to use this operator in TPOT.
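First, a sketch of what such a subset csv might look like (the column headers, set names, sizes, and gene names below are purely illustrative):

Subset,Size,Features
GO_0006915,3,BCL2;CASP3;TP53
GO_0007165,2,EGFR;MAPK1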

          Please check our preprint paper for more details.

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           import numpy as np
           import pandas as pd
           from tpot.config import classifier_config_dict
          @@ -686,11 +680,10 @@ 

          FeatureSetSelector in TPOT

                      config_dict=classifier_config_dict)
tpot.fit(test_X, test_y)
          -

          Pipeline caching in TPOT

With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to those of another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run).

          There are three methods for enabling memory caching in TPOT:

          -
          from tpot import TPOTClassifier
          +
          from tpot import TPOTClassifier
           from tempfile import mkdtemp
           from joblib import Memory
           from shutil import rmtree
          @@ -709,13 +702,12 @@ 

          Pipeline caching in TPOT

# Clear the cache directory when you don't need it anymore
rmtree(cachedir)
          -

Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you no longer need them.

          Crash/freeze issue with n_jobs > 1 under OSX or Linux

          Internally, TPOT uses joblib to fit estimators in parallel. This is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux as scikit-learn does, especially with large datasets.

One solution is to configure Python's multiprocessing module to use the forkserver start method (instead of the default fork) to manage the process pools. You can enable the forkserver mode globally for your program by putting the following code into your main script:

          -
          import multiprocessing
          +
          import multiprocessing
           
           # other imports, custom code, load data, define model...
           
          @@ -724,21 +716,19 @@ 

Crash/freeze issue w

# call scikit-learn utils or tpot utils with n_jobs > 1 here

          -

          More information about these start methods can be found in the multiprocessing documentation.

          Parallel Training with Dask

For large problems, or when working in a Jupyter notebook, we highly recommend that you distribute the work on a Dask cluster. The dask-examples binder has a runnable example with a small dask cluster.

To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True, TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10*n_jobs if it is less than the offspring size) of parallel training.

          -
          estimator = TPOTEstimator(use_dask=True, n_jobs=-1)
          +
          estimator = TPOTEstimator(use_dask=True, n_jobs=-1)
           
-This will use use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data.

          This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. It will also provide fine-grained diagnostics in the distributed scheduler UI.

          Alternatively, Dask implements a joblib backend. You can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend:

          -
          import joblib
          +
          import joblib
           import distributed.joblib
           from dask.distributed import Client
           
          @@ -752,7 +742,6 @@ 

          Parallel Training with Dask

with joblib.parallel_backend("dask"):
    estimator.fit(X, y)
          -

          See dask's distributed joblib integration for more.

          Neural Networks in TPOT (tpot.nn)

Support for neural network models and deep learning is an experimental feature newly added to TPOT. Available neural network architectures are provided by the tpot.nn module. Unlike regular sklearn estimators, these models need to be written by hand, and must also inherit the appropriate base classes provided by sklearn for all of their built-in modules. In other words, they need to implement methods like .fit(), fit_transform(), get_params(), etc., as described in detail in Developing scikit-learn estimators.
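The sketch below is purely illustrative (the class name and parameters are hypothetical, and it is not TPOT's actual implementation), but it shows the scikit-learn estimator interface such a hand-written model has to expose; get_params()/set_params() come for free from BaseEstimator as long as __init__ stores its arguments unchanged:

from sklearn.base import BaseEstimator, ClassifierMixin

class MyPytorchClassifier(BaseEstimator, ClassifierMixin):
    """Hypothetical hand-written model exposing the sklearn estimator API."""

    def __init__(self, learning_rate=1e-2, num_epochs=10):
        # store parameters verbatim so BaseEstimator's get_params()/set_params() work
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs

    def fit(self, X, y):
        # build and train the underlying PyTorch network here
        return self

    def predict(self, X):
        # return predicted class labels for X
        raise NotImplementedError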

          @@ -766,13 +755,12 @@

          Telling TPOT

          Use a configuration dictionary that includes one or more tpot.nn estimators, either by writing one manually, including one from a file, or by importing the configuration in tpot/config/classifier_nn.py. A very simple example that will force TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is as follows:

        -
        tpot_config = {
        +
        tpot_config = {
             'tpot.nn.PytorchLRClassifier': {
                 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
             }
         }
         
        -
        • Alternatively, use a template string including PytorchLRClassifier or PytorchMLPClassifier while loading the TPOT-NN configuration dictionary.
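For example, the following sketch (assuming the built-in 'TPOT NN' configuration string from the table above) restricts the final pipeline step to the PyTorch logistic regression estimator:

from tpot import TPOTClassifier

# 'TPOT NN' loads the TPOT-NN configuration; the template pins the last step
tpot = TPOTClassifier(config_dict='TPOT NN',
                      template='Selector-Transformer-PytorchLRClassifier',
                      verbosity=2)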
diff --git a/tutorials/cuML_Classification_Example.ipynb b/tutorials/cuML_Classification_Example.ipynb
index 23761120..d0212f9f 100644
--- a/tutorials/cuML_Classification_Example.ipynb
+++ b/tutorials/cuML_Classification_Example.ipynb
@@ -90,8 +90,8 @@
    "source": [
     "# TPOT setup\n",
     "GENERATIONS = 5\n",
-    "POP_SIZE = 5\n",
-    "CV = 2\n",
+    "POP_SIZE = 100\n",
+    "CV = 5\n",
     "\n",
     "tpot = TPOTClassifier(\n",
     "    generations=GENERATIONS,\n",
diff --git a/tutorials/cuML_Regression_Example.ipynb b/tutorials/cuML_Regression_Example.ipynb
index a28bfa2c..834efc5a 100644
--- a/tutorials/cuML_Regression_Example.ipynb
+++ b/tutorials/cuML_Regression_Example.ipynb
@@ -87,8 +87,8 @@
    "source": [
     "# TPOT setup\n",
     "GENERATIONS = 5\n",
-    "POP_SIZE = 5\n",
-    "CV = 2\n",
+    "POP_SIZE = 100\n",
+    "CV = 5\n",
     "\n",
     "tpot = TPOTRegressor(\n",
     "    generations=GENERATIONS,\n",