From 4018ef4ca59c632b838643ac2505c1b85279b405 Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Tue, 19 Jul 2016 16:48:52 -0700 Subject: [PATCH] Renamed DataFeaturizer to DataLoader --- deepchem/datasets/bace_datasets.py | 4 +- deepchem/datasets/muv_datasets.py | 10 +-- deepchem/datasets/nci_datasets.py | 10 +-- deepchem/datasets/pcba_datasets.py | 10 +-- deepchem/datasets/pdbbind_datasets.py | 1 - deepchem/datasets/tests/__init__.py | 8 +-- deepchem/datasets/tests/test_datasets.py | 2 +- deepchem/datasets/tests/test_drop.py | 10 +-- deepchem/datasets/tests/test_load.py | 34 +++++----- deepchem/datasets/tests/test_merge.py | 24 +++---- deepchem/datasets/tests/test_reload.py | 10 +-- deepchem/datasets/tests/test_shuffle.py | 10 +-- deepchem/datasets/tox21_datasets.py | 10 +-- deepchem/featurizers/featurize.py | 2 +- .../featurizers/tests/test_data_featurizer.py | 14 ++--- .../tests/test_featurized_samples.py | 42 ++++++------- deepchem/featurizers/tests/test_sdf_reader.py | 12 ++-- .../tests/test_hyperparam_opt.py | 28 ++++----- deepchem/models/tests/__init__.py | 1 - deepchem/models/tests/test_api.py | 62 +++++++++---------- deepchem/models/tests/test_multitask.py | 12 ++-- deepchem/scripts/featurize_pdbbind.py | 18 +++--- examples/bace/bace_dnn.py | 1 - examples/pcba/pcba_sklearn.py | 2 - examples/pdbbind_nnscore.py | 14 ++--- 25 files changed, 173 insertions(+), 178 deletions(-) diff --git a/deepchem/datasets/bace_datasets.py b/deepchem/datasets/bace_datasets.py index 77757257b9..195763bc64 100644 --- a/deepchem/datasets/bace_datasets.py +++ b/deepchem/datasets/bace_datasets.py @@ -11,7 +11,7 @@ from deepchem.utils.save import load_from_disk from deepchem.splits import SpecifiedSplitter from deepchem.featurizers import UserDefinedFeaturizer -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.datasets import Dataset from deepchem.transformers import NormalizationTransformer from deepchem.transformers import ClippingTransformer @@ -69,7 +69,7 @@ def load_bace(mode="regression", transform=True, split="20-80"): else: raise ValueError("Unknown mode %s" % mode) featurizer = UserDefinedFeaturizer(user_specified_features) - loader = DataFeaturizer(tasks=bace_tasks, + loader = DataLoader(tasks=bace_tasks, smiles_field="mol", id_field="CID", featurizer=featurizer) diff --git a/deepchem/datasets/muv_datasets.py b/deepchem/datasets/muv_datasets.py index 7c867da6ed..3ee7545f8e 100644 --- a/deepchem/datasets/muv_datasets.py +++ b/deepchem/datasets/muv_datasets.py @@ -10,7 +10,7 @@ import shutil from deepchem.utils.save import load_from_disk from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.transformers import BalancingTransformer @@ -49,10 +49,10 @@ def load_muv(base_dir, reload=True): 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832']) - loader = DataFeaturizer(tasks=all_MUV_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=all_MUV_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) if not reload or not os.path.exists(data_dir): dataset = loader.featurize(dataset_file, data_dir) regen = True diff --git a/deepchem/datasets/nci_datasets.py b/deepchem/datasets/nci_datasets.py index 1c38de60f0..4cbc79ab09 100644 --- a/deepchem/datasets/nci_datasets.py +++ b/deepchem/datasets/nci_datasets.py @@ -13,7 +13,7 @@ import shutil from deepchem.utils.save import load_sharded_csv from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.transformers import NormalizationTransformer @@ -65,10 +65,10 @@ def load_nci(base_dir, reload=True, force_transform=False): 'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549', 'T-47D']) - loader = DataFeaturizer(tasks=all_nci_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=all_nci_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) if not reload or not os.path.exists(data_dir): dataset = loader.featurize(dataset_paths, data_dir) regen = True diff --git a/deepchem/datasets/pcba_datasets.py b/deepchem/datasets/pcba_datasets.py index 7a6a537108..68e81cc1dc 100644 --- a/deepchem/datasets/pcba_datasets.py +++ b/deepchem/datasets/pcba_datasets.py @@ -10,7 +10,7 @@ import shutil from deepchem.utils.save import load_from_disk from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.transformers import BalancingTransformer @@ -70,10 +70,10 @@ def load_pcba(base_dir, reload=True): 'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915', 'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995'] - loader = DataFeaturizer(tasks=all_PCBA_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=all_PCBA_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) if not reload or not os.path.exists(data_dir): dataset = loader.featurize(dataset_file, data_dir) regen = True diff --git a/deepchem/datasets/pdbbind_datasets.py b/deepchem/datasets/pdbbind_datasets.py index 0f1187057a..1f13166585 100644 --- a/deepchem/datasets/pdbbind_datasets.py +++ b/deepchem/datasets/pdbbind_datasets.py @@ -13,7 +13,6 @@ from rdkit import Chem from deepchem.utils.save import load_from_disk from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.transformers import BalancingTransformer from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer diff --git a/deepchem/datasets/tests/__init__.py b/deepchem/datasets/tests/__init__.py index 9e486125b9..6ea227c4b1 100644 --- a/deepchem/datasets/tests/__init__.py +++ b/deepchem/datasets/tests/__init__.py @@ -15,7 +15,7 @@ import shutil import numpy as np from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.transformers import NormalizationTransformer from deepchem.models.tests import TestAPI @@ -33,7 +33,7 @@ def load_solubility_data(self): tasks = ["log-solubility"] task_type = "regression" input_file = os.path.join(self.current_dir, "../../models/tests/example.csv") - featurizer = DataFeaturizer( + featurizer = DataLoader( tasks=tasks, smiles_field=self.smiles_field, featurizer=featurizer, @@ -50,7 +50,7 @@ def load_classification_data(self): task_type = "classification" input_file = os.path.join( self.current_dir, "../../models/tests/example_classification.csv") - loader = DataFeaturizer( + loader = DataLoader( tasks=tasks, smiles_field=self.smiles_field, featurizer=featurizer, @@ -67,7 +67,7 @@ def load_multitask_data(self): "task13", "task14", "task15", "task16"] input_file = os.path.join( self.current_dir, "../../models/tests/multitask_example.csv") - loader = DataFeaturizer( + loader = DataLoader( tasks=tasks, smiles_field=self.smiles_field, featurizer=featurizer, diff --git a/deepchem/datasets/tests/test_datasets.py b/deepchem/datasets/tests/test_datasets.py index 14480c5a00..936647e41a 100644 --- a/deepchem/datasets/tests/test_datasets.py +++ b/deepchem/datasets/tests/test_datasets.py @@ -15,7 +15,7 @@ import shutil import numpy as np from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.transformers import NormalizationTransformer from deepchem.datasets.tests import TestDatasetAPI diff --git a/deepchem/datasets/tests/test_drop.py b/deepchem/datasets/tests/test_drop.py index de8e8d4e4f..dbe6835ab7 100644 --- a/deepchem/datasets/tests/test_drop.py +++ b/deepchem/datasets/tests/test_drop.py @@ -4,7 +4,7 @@ import numpy as np from deepchem.models.tests import TestAPI from deepchem.utils.save import load_from_disk -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.datasets import Dataset from sklearn.ensemble import RandomForestClassifier from deepchem.models.sklearn_models import SklearnModel @@ -37,10 +37,10 @@ def test_drop(self): featurizer = CircularFingerprint(size=1024) emols_tasks = ['activity'] - loader = DataFeaturizer(tasks=emols_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=emols_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False) X, y, w, ids = dataset.to_numpy() diff --git a/deepchem/datasets/tests/test_load.py b/deepchem/datasets/tests/test_load.py index 23ed5cddb1..48df0bc35f 100644 --- a/deepchem/datasets/tests/test_load.py +++ b/deepchem/datasets/tests/test_load.py @@ -16,7 +16,7 @@ from deepchem.models.tests import TestAPI from deepchem.utils.save import load_from_disk from deepchem.featurizers.fingerprints import CircularFingerprint -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.datasets import Dataset ## task0: 1,1,0,-,0,-,1,-,-,1 @@ -37,10 +37,10 @@ def test_move_load(self): featurizer = CircularFingerprint(size=1024) tasks = ["log-solubility"] - loader = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize(dataset_file, data_dir) X, y, w, ids = dataset.to_numpy() @@ -92,10 +92,10 @@ def test_multiload(self): all_tasks = ["task%d"%i for i in range(17)] ####### Do featurization - loader = DataFeaturizer(tasks=all_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=all_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize( dataset_file, data_dir) @@ -160,10 +160,10 @@ def test_singletask_matches_multitask_load(self): tasks = all_tasks[0:n_tasks] ####### Do multitask load - loader = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize(dataset_file, data_dir) # Do train/valid split. @@ -176,10 +176,10 @@ def test_singletask_matches_multitask_load(self): print("Processing task %s" % task) if os.path.exists(data_dir): shutil.rmtree(data_dir) - loader = DataFeaturizer(tasks=[task], - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=[task], + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize(dataset_file, data_dir) X_task, y_task, w_task, ids_task = dataset.to_numpy() diff --git a/deepchem/datasets/tests/test_merge.py b/deepchem/datasets/tests/test_merge.py index a56f9277e0..9c7dbcde28 100644 --- a/deepchem/datasets/tests/test_merge.py +++ b/deepchem/datasets/tests/test_merge.py @@ -16,7 +16,7 @@ from deepchem.models.tests import TestAPI from deepchem.utils.save import load_from_disk from deepchem.featurizers.fingerprints import CircularFingerprint -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.datasets import Dataset class TestMerge(TestAPI): @@ -36,13 +36,13 @@ def test_merge(self): featurizer = CircularFingerprint(size=1024) tasks = ["log-solubility"] - featurizer = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) - first_dataset = featurizer.featurize( + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) + first_dataset = loader.featurize( dataset_file, first_data_dir) - second_dataset = featurizer.featurize( + second_dataset = loader.featurize( dataset_file, second_data_dir) merged_dataset = Dataset.merge( @@ -62,11 +62,11 @@ def test_subset(self): featurizer = CircularFingerprint(size=1024) tasks = ["log-solubility"] - featurizer = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) - dataset = featurizer.featurize( + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) + dataset = loader.featurize( dataset_file, data_dir, shard_size=2) shard_nums = [1, 2] diff --git a/deepchem/datasets/tests/test_reload.py b/deepchem/datasets/tests/test_reload.py index 7dd6806109..7669995b85 100644 --- a/deepchem/datasets/tests/test_reload.py +++ b/deepchem/datasets/tests/test_reload.py @@ -17,7 +17,7 @@ from deepchem.models.tests import TestAPI from deepchem.utils.save import load_from_disk from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.splits import ScaffoldSplitter from deepchem.datasets import Dataset @@ -45,10 +45,10 @@ def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None): 'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'] - loader = DataFeaturizer(tasks=MUV_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=MUV_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize(dataset_file, self.data_dir) assert len(dataset) == len(raw_dataset) diff --git a/deepchem/datasets/tests/test_shuffle.py b/deepchem/datasets/tests/test_shuffle.py index 9e7d0ca215..092de3799d 100644 --- a/deepchem/datasets/tests/test_shuffle.py +++ b/deepchem/datasets/tests/test_shuffle.py @@ -16,7 +16,7 @@ from deepchem.models.tests import TestAPI from deepchem.utils.save import load_from_disk from deepchem.featurizers.fingerprints import CircularFingerprint -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.datasets import Dataset class TestShuffle(TestAPI): @@ -34,10 +34,10 @@ def test_shuffle(self): featurizer = CircularFingerprint(size=1024) tasks = ["log-solubility"] - loader = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize( dataset_file, data_dir, shard_size=2) diff --git a/deepchem/datasets/tox21_datasets.py b/deepchem/datasets/tox21_datasets.py index dad41ee6c0..ee38d57f1e 100644 --- a/deepchem/datasets/tox21_datasets.py +++ b/deepchem/datasets/tox21_datasets.py @@ -12,7 +12,7 @@ from sklearn.ensemble import RandomForestClassifier from deepchem.utils.save import load_from_disk from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.splits import ScaffoldSplitter from deepchem.splits import RandomSplitter @@ -62,10 +62,10 @@ def load_tox21(base_dir, reload=True): 'SR-HSE', 'SR-MMP', 'SR-p53'] if not reload or not os.path.exists(data_dir): - loader = DataFeaturizer(tasks=all_tox21_tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=all_tox21_tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) dataset = loader.featurize( dataset_file, data_dir, shard_size=8192) else: diff --git a/deepchem/featurizers/featurize.py b/deepchem/featurizers/featurize.py index 06abf2a686..6c530f590c 100644 --- a/deepchem/featurizers/featurize.py +++ b/deepchem/featurizers/featurize.py @@ -124,7 +124,7 @@ def featurize_map_function(args): # else: # raise ValueError("Field of unrecognized type: %s" % str(val)) -class DataFeaturizer(object): +class DataLoader(object): """ Handles loading/featurizing of chemical samples (datapoints). diff --git a/deepchem/featurizers/tests/test_data_featurizer.py b/deepchem/featurizers/tests/test_data_featurizer.py index e4733c6902..eef62e8ac5 100644 --- a/deepchem/featurizers/tests/test_data_featurizer.py +++ b/deepchem/featurizers/tests/test_data_featurizer.py @@ -1,5 +1,5 @@ """ -Tests for DataFeaturizer class +Tests for DataLoader class """ from __future__ import print_function from __future__ import division @@ -14,10 +14,10 @@ import tempfile import shutil from deepchem.models.tests import TestAPI -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint -class TestDataFeaturizer(TestAPI): +class TestDataLoader(TestAPI): """ Test Data Featurizer class. """ @@ -29,10 +29,10 @@ def test_log_solubility_dataset(self): tasks = ["log-solubility"] smiles_field = "smiles" - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=CircularFingerprint(size=1024), - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=CircularFingerprint(size=1024), + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) assert len(dataset) == 10 diff --git a/deepchem/featurizers/tests/test_featurized_samples.py b/deepchem/featurizers/tests/test_featurized_samples.py index 641d04a1dd..408c5cc9ec 100644 --- a/deepchem/featurizers/tests/test_featurized_samples.py +++ b/deepchem/featurizers/tests/test_featurized_samples.py @@ -18,7 +18,7 @@ from deepchem.splits import RandomSplitter from deepchem.splits import ScaffoldSplitter from deepchem.splits import SpecifiedSplitter -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint #from deepchem.featurizers.featurize import FeaturizedSamples @@ -40,10 +40,10 @@ def scaffold_test_train_valid_test_split(self): featurizer = CircularFingerprint(size=1024) input_file = os.path.join(self.current_dir, input_file) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) @@ -68,10 +68,10 @@ def scaffold_test_train_test_split(self): featurizer = CircularFingerprint(size=1024) input_file = os.path.join(self.current_dir, input_file) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) @@ -94,10 +94,10 @@ def random_test_train_valid_test_split(self): featurizer = CircularFingerprint(size=1024) input_file = os.path.join(self.current_dir, input_file) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) @@ -118,10 +118,10 @@ def random_test_train_test_split(self): task_types = {task: task_type for task in tasks} input_file = os.path.join(self.current_dir, "example.csv") featurizer = CircularFingerprint(size=1024) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) @@ -142,10 +142,10 @@ def test_samples_move(self): featurizer = CircularFingerprint(size=1024) tasks = ["log-solubility"] - loader = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - featurizer=featurizer, - verbosity=verbosity) + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + featurizer=featurizer, + verbosity=verbosity) featurized_dataset = loader.featurize( dataset_file, data_dir) n_dataset = len(featurized_dataset) diff --git a/deepchem/featurizers/tests/test_sdf_reader.py b/deepchem/featurizers/tests/test_sdf_reader.py index 7c1415f6f1..317c5af357 100644 --- a/deepchem/featurizers/tests/test_sdf_reader.py +++ b/deepchem/featurizers/tests/test_sdf_reader.py @@ -14,7 +14,7 @@ import tempfile import shutil from deepchem.splits import RandomSplitter -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.coulomb_matrices import CoulombMatrixEig from deepchem.models.tests import TestAPI @@ -38,11 +38,11 @@ def random_test_train_valid_test_split_from_sdf(self): featurizer = CoulombMatrixEig(6, remove_hydrogens=False) input_file = os.path.join(self.current_dir, input_file) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - mol_field="mol", - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + mol_field="mol", + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) diff --git a/deepchem/hyperparameters/tests/test_hyperparam_opt.py b/deepchem/hyperparameters/tests/test_hyperparam_opt.py index d4555c5074..791ee36029 100644 --- a/deepchem/hyperparameters/tests/test_hyperparam_opt.py +++ b/deepchem/hyperparameters/tests/test_hyperparam_opt.py @@ -17,7 +17,7 @@ from deepchem.models.tests import TestAPI from deepchem.models.sklearn_models import SklearnModel from deepchem.featurizers.fingerprints import CircularFingerprint -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.transformers import NormalizationTransformer from deepchem import metrics from deepchem.metrics import Metric @@ -56,10 +56,10 @@ def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self): task_type = "regression" task_types = {task: task_type for task in tasks} input_file = os.path.join(self.current_dir, "example.csv") - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = featurizer.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() @@ -147,10 +147,10 @@ def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self): task_types = {task: task_type for task in tasks} featurizer = CircularFingerprint(size=1024) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() @@ -192,11 +192,11 @@ def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self): featurizer = CircularFingerprint(size=1024) - data_featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") - dataset = data_featurizer.featurize(input_file, self.data_dir) + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") + dataset = loader.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split( diff --git a/deepchem/models/tests/__init__.py b/deepchem/models/tests/__init__.py index 2170bf488c..9f35b07ef4 100644 --- a/deepchem/models/tests/__init__.py +++ b/deepchem/models/tests/__init__.py @@ -13,7 +13,6 @@ import unittest import tempfile import shutil -from deepchem.featurizers.featurize import DataFeaturizer from deepchem.datasets import Dataset from deepchem.utils.evaluate import Evaluator from deepchem.models import Model diff --git a/deepchem/models/tests/test_api.py b/deepchem/models/tests/test_api.py index d0c75d73f0..30c3c49cc1 100644 --- a/deepchem/models/tests/test_api.py +++ b/deepchem/models/tests/test_api.py @@ -14,7 +14,7 @@ import tempfile import shutil from deepchem.featurizers import UserDefinedFeaturizer -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.featurizers.basic import RDKitDescriptors from deepchem.featurizers.grid_featurizer import GridFeaturizer @@ -47,10 +47,10 @@ def test_singletask_sklearn_rf_ECFP_regression_API(self): task_type = "regression" task_types = {task: task_type for task in tasks} input_file = os.path.join(self.current_dir, "example.csv") - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = featurizer.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() @@ -91,10 +91,10 @@ def test_singletask_sklearn_rf_user_specified_regression_API(self): task_type = "regression" task_types = {task: task_type for task in tasks} input_file = os.path.join(self.current_dir, "user_specified_example.csv") - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = featurizer.featurize(input_file, self.data_dir, debug=True) splitter = SpecifiedSplitter(input_file, "split") @@ -141,11 +141,11 @@ def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self): input_file = os.path.join( self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz") - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") - dataset = featurizer.featurize(input_file, self.data_dir) + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") + dataset = loader.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() train_dataset, test_dataset = splitter.train_test_split( @@ -189,10 +189,10 @@ def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self): task_types = {task: task_type for task in tasks} model_params = {} input_file = os.path.join(self.current_dir, "example.csv") - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = featurizer.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() @@ -249,10 +249,10 @@ def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self): # "nb_layers": 1, "batchnorm": False} # input_file = os.path.join(self.current_dir, "gbd3k.pkl.gz") - # featurizer = DataFeaturizer(tasks=tasks, - # smiles_field=self.smiles_field, - # featurizer=featurizer, - # verbosity="low") + # featurizer = DataLoader(tasks=tasks, + # smiles_field=self.smiles_field, + # featurizer=featurizer, + # verbosity="low") # dataset = featurizer.featurize(input_file, self.data_dir) # splitter = ScaffoldSplitter() @@ -311,10 +311,10 @@ def test_multitask_keras_mlp_ECFP_classification_API(self): featurizer = CircularFingerprint(size=1024) - loader = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") dataset = loader.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() train_dataset, test_dataset = splitter.train_test_split( @@ -355,11 +355,11 @@ def test_singletask_tf_mlp_ECFP_classification_API(self): task_types = {task: task_type for task in tasks} input_file = os.path.join(self.current_dir, "example_classification.csv") - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") - dataset = featurizer.featurize(input_file, self.data_dir) + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") + dataset = loader.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() train_dataset, test_dataset = splitter.train_test_split( diff --git a/deepchem/models/tests/test_multitask.py b/deepchem/models/tests/test_multitask.py index 1af3547c77..a0402925f7 100644 --- a/deepchem/models/tests/test_multitask.py +++ b/deepchem/models/tests/test_multitask.py @@ -14,7 +14,7 @@ import tempfile import shutil from deepchem.featurizers.fingerprints import CircularFingerprint -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.datasets import Dataset from deepchem.models.tests import TestAPI from deepchem.splits import ScaffoldSplitter @@ -47,11 +47,11 @@ def test_multitask_order(self): featurizer = CircularFingerprint(size=1024) - featurizer = DataFeaturizer(tasks=tasks, - smiles_field=self.smiles_field, - featurizer=featurizer, - verbosity="low") - dataset = featurizer.featurize(input_file, self.data_dir) + loader = DataLoader(tasks=tasks, + smiles_field=self.smiles_field, + featurizer=featurizer, + verbosity="low") + dataset = loader.featurize(input_file, self.data_dir) splitter = ScaffoldSplitter() train_dataset, test_dataset = splitter.train_test_split( diff --git a/deepchem/scripts/featurize_pdbbind.py b/deepchem/scripts/featurize_pdbbind.py index f6cd976324..15829aecd1 100644 --- a/deepchem/scripts/featurize_pdbbind.py +++ b/deepchem/scripts/featurize_pdbbind.py @@ -4,6 +4,7 @@ from deepchem.featurizers.basic import RDKitDescriptors from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer from deepchem.featurizers.grid_featurizer import GridFeaturizer +from deepchem.featurizers.featurize import DataLoader dataset_file = "../../../datasets/pdbbind_full_df.pkl.gz" print("About to load dataset form disk.") @@ -37,17 +38,16 @@ os.makedirs(samples_dir) -from deepchem.featurizers.featurize import DataFeaturizer featurizers = compound_featurizers + complex_featurizers -featurizer = DataFeaturizer(tasks=["label"], - smiles_field="smiles", - protein_pdb_field="protein_pdb", - ligand_pdb_field="ligand_pdb", - compound_featurizers=compound_featurizers, - complex_featurizers=complex_featurizers, - id_field="complex_id", - verbose=False) +featurizer = DataLoader(tasks=["label"], + smiles_field="smiles", + protein_pdb_field="protein_pdb", + ligand_pdb_field="ligand_pdb", + compound_featurizers=compound_featurizers, + complex_featurizers=complex_featurizers, + id_field="complex_id", + verbose=False) from ipyparallel import Client c = Client() print("c.ids") diff --git a/examples/bace/bace_dnn.py b/examples/bace/bace_dnn.py index 2d2bdf884a..3e80d6dd6b 100644 --- a/examples/bace/bace_dnn.py +++ b/examples/bace/bace_dnn.py @@ -6,7 +6,6 @@ import numpy.random from deepchem.utils.save import load_from_disk from deepchem.splits import SpecifiedSplitter -from deepchem.featurizers.featurize import DataFeaturizer from deepchem.datasets import Dataset from deepchem.transformers import NormalizationTransformer from deepchem.transformers import ClippingTransformer diff --git a/examples/pcba/pcba_sklearn.py b/examples/pcba/pcba_sklearn.py index 7c164dcf00..69fe3b313d 100644 --- a/examples/pcba/pcba_sklearn.py +++ b/examples/pcba/pcba_sklearn.py @@ -12,7 +12,6 @@ from sklearn.ensemble import RandomForestClassifier from deepchem.utils.save import load_from_disk from deepchem.datasets import Dataset -from deepchem.featurizers.featurize import DataFeaturizer from deepchem.featurizers.fingerprints import CircularFingerprint from deepchem.splits import ScaffoldSplitter from deepchem.splits import RandomSplitter @@ -28,7 +27,6 @@ from deepchem.utils.evaluate import Evaluator from deepchem.datasets.pcba_datasets import load_pcba - np.random.seed(123) # Set some global variables up top diff --git a/examples/pdbbind_nnscore.py b/examples/pdbbind_nnscore.py index 38e9639920..a4e4f65a29 100644 --- a/examples/pdbbind_nnscore.py +++ b/examples/pdbbind_nnscore.py @@ -11,7 +11,7 @@ import tempfile import shutil -from deepchem.featurizers.featurize import DataFeaturizer +from deepchem.featurizers.featurize import DataLoader from deepchem.featurizers.featurize import FeaturizedSamples from deepchem.utils.dataset import Dataset from deepchem.utils.evaluate import Evaluator @@ -26,15 +26,15 @@ def featurize_and_split(input_file, feature_dir, samples_dir, train_dir, output_transforms, tasks, feature_files=None): """Featurize inputs with NNScore and do train-test split.""" - featurizer = DataFeaturizer(tasks=tasks, - smiles_field="smiles", - protein_pdb_field="protein_pdb", - ligand_pdb_field="ligand_pdb", - verbose=True) + loader = DataLoader(tasks=tasks, + smiles_field="smiles", + protein_pdb_field="protein_pdb", + ligand_pdb_field="ligand_pdb", + verbose=True) if feature_files is None: print("About to featurize.") - samples = featurizer.featurize(input_file, feature_dir, + samples = loader.featurize(input_file, feature_dir, samples_dir, shard_size=8) print("Completed Featurization") else: