Skip to content

Commit

Permalink
add SDAE, supervised denoising autoencoder
Browse files Browse the repository at this point in the history
  • Loading branch information
jeongyoonlee committed Jun 1, 2021
1 parent e0fc66d commit 5fa3de1
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 8 deletions.
4 changes: 2 additions & 2 deletions kaggler/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .autoencoder import DAE
from .autoencoder import DAE, SDAE
from .categorical import OneHotEncoder, LabelEncoder, TargetEncoder, EmbeddingEncoder, FrequencyEncoder
from .numerical import Normalizer, QuantileEncoder

__all__ = ['DAE', 'OneHotEncoder', 'LabelEncoder', 'TargetEncoder', 'EmbeddingEncoder',
__all__ = ['DAE', 'SDAE', 'OneHotEncoder', 'LabelEncoder', 'TargetEncoder', 'EmbeddingEncoder',
'Normalizer', 'QuantileEncoder', 'FrequencyEncoder']
90 changes: 85 additions & 5 deletions kaggler/preprocessing/autoencoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def __init__(self, cat_cols=[], num_cols=[], n_emb=[], encoding_dim=128, n_layer
self.seed = random_state
self.lbe = LabelEncoder(min_obs=min_obs)

def build_model(self, X):
def build_model(self, X, y=None):
inputs = []
num_inputs = []
embeddings = []
Expand Down Expand Up @@ -244,8 +244,8 @@ def build_model(self, X):
encoded, decoded = dae_layers[i](merged_inputs)
_, merged_inputs = dae_layers[i](merged_inputs, training=False)

self.encoder = Model(inputs=inputs, outputs=encoded)
self.dae = Model(inputs=inputs, outputs=decoded)
self.encoder = Model(inputs=inputs, outputs=encoded, name='encoder_head')
self.dae = Model(inputs=inputs, outputs=decoded, name='decoder_head')
self.dae.compile(optimizer='adam')

def fit(self, X, y=None):
Expand All @@ -256,12 +256,12 @@ def fit(self, X, y=None):
y (pandas.Series, optional): not used
Returns:
A trained AutoEncoder object.
A trained DAE object.
"""
if self.cat_cols:
X[self.cat_cols] = self.lbe.fit_transform(X[self.cat_cols])

self.build_model(X)
self.build_model(X, y)

features = [X[col].values for col in self.cat_cols]
if self.num_cols:
Expand Down Expand Up @@ -306,3 +306,83 @@ def fit_transform(self, X, y=None):
"""
self.fit(X, y)
return self.transform(X)


class SDAE(DAE):
    """Supervised Denoising AutoEncoder feature transformer.

    Extends ``DAE`` by replacing the reconstruction head with a supervised
    prediction head on top of the encoder, so the learned representation is
    driven by the target variable. The task type (binary classification,
    multi-class classification, or regression) is inferred from ``y``.
    """

    def build_model(self, X, y=None):
        """Build the encoder and the supervised head.

        Args:
            X (pandas.DataFrame): features to encode
            y (pandas.Series): target variable; its dtype/cardinality decide
                the output layer size, activation, and loss

        Side effects:
            Sets ``self.encoder``, ``self.supervised``, and the compiled
            end-to-end model ``self.dae``.
        """
        inputs = []
        num_inputs = []
        embeddings = []

        if self.cat_cols:
            for i, col in enumerate(self.cat_cols):
                n_uniq = X[col].nunique()
                # Heuristic embedding size when not user-specified.
                if not self.n_emb[i]:
                    self.n_emb[i] = max(MIN_EMBEDDING, 2 * int(np.log2(n_uniq)))

                inp = Input(shape=(1,), name=col)
                emb = Embedding(input_dim=n_uniq, output_dim=self.n_emb[i], name=col + EMBEDDING_SUFFIX)(inp)
                emb = Dropout(self.dropout)(emb)
                emb = Reshape((self.n_emb[i],))(emb)

                inputs.append(inp)
                embeddings.append(emb)

        if self.num_cols:
            num_inputs = Input(shape=(len(self.num_cols),), name='num_inputs')
            merged_inputs = Concatenate()(embeddings + [num_inputs]) if embeddings else num_inputs

            inputs = inputs + [num_inputs]
        else:
            merged_inputs = Concatenate()(embeddings)

        dae_layers = []
        for i in range(self.n_layer):
            dae_layers.append(DAELayer(encoding_dim=self.encoding_dim, noise_std=self.noise_std,
                                       swap_prob=self.swap_prob, mask_prob=self.mask_prob,
                                       random_state=self.seed, name=f'dae_layer_{i}'))

            encoded, decoded = dae_layers[i](merged_inputs)
            # Feed the noise-free (training=False) encoding into the next layer.
            _, merged_inputs = dae_layers[i](merged_inputs, training=False)

        self.encoder = Model(inputs=inputs, outputs=encoded, name='encoder_head')

        # Infer the supervised task from y. Any integer dtype is treated as
        # classification (the original int32/int64 check missed e.g. int8/int16);
        # everything else (or a missing y) falls back to regression.
        if y is not None and np.issubdtype(np.asarray(y).dtype, np.integer):
            n_uniq = len(np.unique(y))
            if n_uniq == 2:
                self.n_class = 1
                # BUG FIX: activation must be a Keras activation name, not a
                # loss name — Dense(activation='binary_crossentropy') is invalid.
                self.output_activation = 'sigmoid'
                self.output_loss = 'binary_crossentropy'
            elif n_uniq > 2:
                self.n_class = n_uniq
                self.output_activation = 'softmax'
                self.output_loss = 'sparse_categorical_crossentropy'
        else:
            self.n_class = 1
            self.output_activation = 'linear'
            self.output_loss = 'mean_squared_error'

        # supervised head
        supervised_inputs = Input((self.encoding_dim,), name='supervised_inputs')
        x = Dense(1024, activation='relu')(supervised_inputs)
        x = Dropout(.3, seed=self.seed)(x)
        supervised_outputs = Dense(self.n_class, activation=self.output_activation)(x)
        self.supervised = Model(inputs=supervised_inputs, outputs=supervised_outputs, name='supervised_head')

        self.dae = Model(inputs=inputs, outputs=self.supervised(self.encoder(inputs)), name='decoder_head')
        self.dae.compile(optimizer='adam', loss=self.output_loss)

    def fit(self, X, y):
        """Train supervised DAE

        Args:
            X (pandas.DataFrame): features to encode
            y (pandas.Series): target variable (required, unlike DAE.fit)

        Returns:
            A trained SDAE object.

        Raises:
            ValueError: if ``y`` is None.
        """
        # Raise instead of assert: asserts are stripped under `python -O`.
        if y is None:
            raise ValueError('SDAE needs y (target variable) for fit()')
        # Propagate the parent's return value so fit() actually returns
        # the trained object as documented.
        return super().fit(X, y)
15 changes: 14 additions & 1 deletion tests/test_encoders.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from kaggler.preprocessing import DAE, TargetEncoder, EmbeddingEncoder, FrequencyEncoder
from kaggler.preprocessing import DAE, SDAE, TargetEncoder, EmbeddingEncoder, FrequencyEncoder
from sklearn.model_selection import KFold

from .const import RANDOM_SEED, TARGET_COL
Expand All @@ -20,6 +20,19 @@ def test_DAE(generate_data):
assert X.shape[1] == encoding_dim


def test_SDAE(generate_data):
    """SDAE.fit_transform should yield `encoding_dim` feature columns."""
    encoding_dim = 10

    df = generate_data()
    feature_cols = [c for c in df.columns if c != TARGET_COL]
    # Low-cardinality columns are treated as categorical; the rest as numeric.
    cat_cols = [c for c in feature_cols if df[c].nunique() < 100]
    num_cols = [c for c in feature_cols if c not in cat_cols]

    sdae = SDAE(cat_cols=cat_cols, num_cols=num_cols,
                encoding_dim=encoding_dim, random_state=RANDOM_SEED)
    encoded = sdae.fit_transform(df[feature_cols], df[TARGET_COL])
    assert encoded.shape[1] == encoding_dim


def test_TargetEncoder(generate_data):
df = generate_data()
feature_cols = [x for x in df.columns if x != TARGET_COL]
Expand Down

0 comments on commit 5fa3de1

Please sign in to comment.