Skip to content

Commit

Permalink
Renamed DataFeaturizer to DataLoader
Browse files (browse the repository at this point in the history)
  • Loading branch information
rbharath committed Jul 19, 2016
1 parent 3dd366c commit 4018ef4
Show file tree
Hide file tree
Showing 25 changed files with 173 additions and 178 deletions.
4 changes: 2 additions & 2 deletions deepchem/datasets/bace_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from deepchem.utils.save import load_from_disk
from deepchem.splits import SpecifiedSplitter
from deepchem.featurizers import UserDefinedFeaturizer
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.datasets import Dataset
from deepchem.transformers import NormalizationTransformer
from deepchem.transformers import ClippingTransformer
Expand Down Expand Up @@ -69,7 +69,7 @@ def load_bace(mode="regression", transform=True, split="20-80"):
else:
raise ValueError("Unknown mode %s" % mode)
featurizer = UserDefinedFeaturizer(user_specified_features)
loader = DataFeaturizer(tasks=bace_tasks,
loader = DataLoader(tasks=bace_tasks,
smiles_field="mol",
id_field="CID",
featurizer=featurizer)
Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/muv_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import shutil
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

Expand Down Expand Up @@ -49,10 +49,10 @@ def load_muv(base_dir, reload=True):
'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
'MUV-466', 'MUV-832'])

loader = DataFeaturizer(tasks=all_MUV_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=all_MUV_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
if not reload or not os.path.exists(data_dir):
dataset = loader.featurize(dataset_file, data_dir)
regen = True
Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/nci_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import shutil
from deepchem.utils.save import load_sharded_csv
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer

Expand Down Expand Up @@ -65,10 +65,10 @@ def load_nci(base_dir, reload=True, force_transform=False):
'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
'T-47D'])

loader = DataFeaturizer(tasks=all_nci_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=all_nci_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
if not reload or not os.path.exists(data_dir):
dataset = loader.featurize(dataset_paths, data_dir)
regen = True
Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/pcba_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import shutil
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

Expand Down Expand Up @@ -70,10 +70,10 @@ def load_pcba(base_dir, reload=True):
'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

loader = DataFeaturizer(tasks=all_PCBA_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=all_PCBA_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
if not reload or not os.path.exists(data_dir):
dataset = loader.featurize(dataset_file, data_dir)
regen = True
Expand Down
1 change: 0 additions & 1 deletion deepchem/datasets/pdbbind_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from rdkit import Chem
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
Expand Down
8 changes: 4 additions & 4 deletions deepchem/datasets/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import shutil
import numpy as np
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer
from deepchem.models.tests import TestAPI
Expand All @@ -33,7 +33,7 @@ def load_solubility_data(self):
tasks = ["log-solubility"]
task_type = "regression"
input_file = os.path.join(self.current_dir, "../../models/tests/example.csv")
featurizer = DataFeaturizer(
featurizer = DataLoader(
tasks=tasks,
smiles_field=self.smiles_field,
featurizer=featurizer,
Expand All @@ -50,7 +50,7 @@ def load_classification_data(self):
task_type = "classification"
input_file = os.path.join(
self.current_dir, "../../models/tests/example_classification.csv")
loader = DataFeaturizer(
loader = DataLoader(
tasks=tasks,
smiles_field=self.smiles_field,
featurizer=featurizer,
Expand All @@ -67,7 +67,7 @@ def load_multitask_data(self):
"task13", "task14", "task15", "task16"]
input_file = os.path.join(
self.current_dir, "../../models/tests/multitask_example.csv")
loader = DataFeaturizer(
loader = DataLoader(
tasks=tasks,
smiles_field=self.smiles_field,
featurizer=featurizer,
Expand Down
2 changes: 1 addition & 1 deletion deepchem/datasets/tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import shutil
import numpy as np
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer
from deepchem.datasets.tests import TestDatasetAPI
Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/tests/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.datasets import Dataset
from sklearn.ensemble import RandomForestClassifier
from deepchem.models.sklearn_models import SklearnModel
Expand Down Expand Up @@ -37,10 +37,10 @@ def test_drop(self):
featurizer = CircularFingerprint(size=1024)
emols_tasks = ['activity']

loader = DataFeaturizer(tasks=emols_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=emols_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False)

X, y, w, ids = dataset.to_numpy()
Expand Down
34 changes: 17 additions & 17 deletions deepchem/datasets/tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.datasets import Dataset

## task0: 1,1,0,-,0,-,1,-,-,1
Expand All @@ -37,10 +37,10 @@ def test_move_load(self):

featurizer = CircularFingerprint(size=1024)
tasks = ["log-solubility"]
loader = DataFeaturizer(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(dataset_file, data_dir)

X, y, w, ids = dataset.to_numpy()
Expand Down Expand Up @@ -92,10 +92,10 @@ def test_multiload(self):
all_tasks = ["task%d"%i for i in range(17)]

####### Do featurization
loader = DataFeaturizer(tasks=all_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=all_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(
dataset_file, data_dir)

Expand Down Expand Up @@ -160,10 +160,10 @@ def test_singletask_matches_multitask_load(self):
tasks = all_tasks[0:n_tasks]

####### Do multitask load
loader = DataFeaturizer(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(dataset_file, data_dir)

# Do train/valid split.
Expand All @@ -176,10 +176,10 @@ def test_singletask_matches_multitask_load(self):
print("Processing task %s" % task)
if os.path.exists(data_dir):
shutil.rmtree(data_dir)
loader = DataFeaturizer(tasks=[task],
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=[task],
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(dataset_file, data_dir)

X_task, y_task, w_task, ids_task = dataset.to_numpy()
Expand Down
24 changes: 12 additions & 12 deletions deepchem/datasets/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.datasets import Dataset

class TestMerge(TestAPI):
Expand All @@ -36,13 +36,13 @@ def test_merge(self):

featurizer = CircularFingerprint(size=1024)
tasks = ["log-solubility"]
featurizer = DataFeaturizer(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
first_dataset = featurizer.featurize(
loader = DataLoader(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
first_dataset = loader.featurize(
dataset_file, first_data_dir)
second_dataset = featurizer.featurize(
second_dataset = loader.featurize(
dataset_file, second_data_dir)

merged_dataset = Dataset.merge(
Expand All @@ -62,11 +62,11 @@ def test_subset(self):

featurizer = CircularFingerprint(size=1024)
tasks = ["log-solubility"]
featurizer = DataFeaturizer(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = featurizer.featurize(
loader = DataLoader(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(
dataset_file, data_dir, shard_size=2)

shard_nums = [1, 2]
Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/tests/test_reload.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.datasets import Dataset
Expand Down Expand Up @@ -45,10 +45,10 @@ def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
'MUV-466', 'MUV-832']
loader = DataFeaturizer(tasks=MUV_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=MUV_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(dataset_file, self.data_dir)
assert len(dataset) == len(raw_dataset)

Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/tests/test_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.datasets import Dataset

class TestShuffle(TestAPI):
Expand All @@ -34,10 +34,10 @@ def test_shuffle(self):

featurizer = CircularFingerprint(size=1024)
tasks = ["log-solubility"]
loader = DataFeaturizer(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(
dataset_file, data_dir, shard_size=2)

Expand Down
10 changes: 5 additions & 5 deletions deepchem/datasets/tox21_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
Expand Down Expand Up @@ -62,10 +62,10 @@ def load_tox21(base_dir, reload=True):
'SR-HSE', 'SR-MMP', 'SR-p53']

if not reload or not os.path.exists(data_dir):
loader = DataFeaturizer(tasks=all_tox21_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
loader = DataLoader(tasks=all_tox21_tasks,
smiles_field="smiles",
featurizer=featurizer,
verbosity=verbosity)
dataset = loader.featurize(
dataset_file, data_dir, shard_size=8192)
else:
Expand Down
2 changes: 1 addition & 1 deletion deepchem/featurizers/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def featurize_map_function(args):
# else:
# raise ValueError("Field of unrecognized type: %s" % str(val))

class DataFeaturizer(object):
class DataLoader(object):
"""
Handles loading/featurizing of chemical samples (datapoints).
Expand Down
14 changes: 7 additions & 7 deletions deepchem/featurizers/tests/test_data_featurizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Tests for DataFeaturizer class
Tests for DataLoader class
"""
from __future__ import print_function
from __future__ import division
Expand All @@ -14,10 +14,10 @@
import tempfile
import shutil
from deepchem.models.tests import TestAPI
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint

class TestDataFeaturizer(TestAPI):
class TestDataLoader(TestAPI):
"""
Test DataLoader class.
"""
Expand All @@ -29,10 +29,10 @@ def test_log_solubility_dataset(self):

tasks = ["log-solubility"]
smiles_field = "smiles"
loader = DataFeaturizer(tasks=tasks,
smiles_field=self.smiles_field,
featurizer=CircularFingerprint(size=1024),
verbosity="low")
loader = DataLoader(tasks=tasks,
smiles_field=self.smiles_field,
featurizer=CircularFingerprint(size=1024),
verbosity="low")
dataset = loader.featurize(input_file, self.data_dir)

assert len(dataset) == 10
Loading

0 comments on commit 4018ef4

Please sign in to comment.