Are RandomForestClassifier OOB posteriors also biased without stratification? #287
Open
Description
We can test:
def test_random_forest_posteriors_on_independent():
    """Check that out-of-bag posteriors are unbiased when X and y are independent.

    Test regression from :gh:`283`.

    Posteriors were biased when the classes were independent and the bootstrap
    and out-of-bag (OOB) sample technique was used to estimate the final
    population test statistic. This resulted in a biased estimate of the AUC
    score. Stratification of the bootstrapping samples was the solution to
    this problem.

    For each of five forest seeds, a forest is fit on pure-noise features with
    balanced binary labels. Every tree predicts only on the samples it did not
    see during fitting, the per-sample posteriors are averaged over the trees
    for which that sample was OOB, and the resulting AUC is recorded. The mean
    AUC over the repeats must lie strictly between 0.49 and 0.51.
    """
    import numpy as np
    from sklearn.metrics import roc_auc_score

    from sktree import RandomForestClassifier

    n_samples, n_features, n_repeats = 128, 4096, 5
    all_idx = np.arange(n_samples)

    scores = []
    for seed in range(n_repeats):
        # Create a dataset with overlapping classes: the features are noise
        # drawn independently of the labels.
        # NOTE: X comes from the global NumPy RNG, so the data differ between
        # runs; only the forest itself is seeded.
        X = np.random.standard_normal(size=(n_samples, n_features))
        # Balanced labels, kept 1-D (first half class 0, second half class 1).
        y = np.concatenate([np.zeros(n_samples // 2), np.ones(n_samples // 2)])

        clf = RandomForestClassifier(
            n_estimators=100,
            random_state=seed,
            bootstrap=True,
            max_samples=1.0,
            n_jobs=-1,
            # stratify=True,  # left disabled on purpose: this probes the un-stratified case
        )
        clf.fit(X, y)

        # Start from NaN so that (tree, sample) pairs where the sample was
        # in-bag are ignored by ``np.nanmean`` below instead of contributing
        # whatever happened to be in uninitialised memory.
        oob_posteriors = np.full((len(clf.estimators_), n_samples, 2), np.nan)
        for tree_idx, (tree, inbag_idx) in enumerate(
            zip(clf.estimators_, clf.estimators_samples_)
        ):
            # Sorted indices of the samples this tree never saw while fitting.
            oob_idx = np.setdiff1d(all_idx, inbag_idx)
            if oob_idx.size == 0:
                # Nothing is out-of-bag for this tree; leave its row as NaN.
                continue
            oob_posteriors[tree_idx, oob_idx, :] = tree.predict_proba(X[oob_idx])

        # Average each sample's class-1 posterior over the trees where it was OOB.
        auc_score = roc_auc_score(y, np.nanmean(oob_posteriors, axis=0)[:, 1])
        scores.append(auc_score)

    mean_score = np.mean(scores)
    # Without stratification, this test should fail
    print(mean_score, scores)
    assert 0.49 < mean_score < 0.51, f"{mean_score} {scores}"
Metadata
Assignees
Labels
No labels
Activity