forked from tomzaragoza/learning-ml-python

Commit d0e9ba4 (1 parent: f61ab64)
11 changed files with 8,093 additions and 0 deletions.
@@ -0,0 +1,131 @@
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

#
# This script trains multinomial Naive Bayes on the tweet corpus
# to find two different results:
# - How well can we distinguish positive from negative tweets?
# - How well can we detect whether a tweet contains sentiment at all?
#

import time
start_time = time.time()

import numpy as np

from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit

from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB


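# Build the classification pipeline: a TF-IDF vectorizer over word
# 1- to 3-grams feeding a multinomial Naive Bayes classifier.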
def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline


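# Train and evaluate the model over 10 random 70/30 train/test splits,
# recording accuracy and the area under the precision/recall curve
# for each split.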
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    # plot the P/R curve of the fold with the median P/R AUC
    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)


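# Error-analysis helper: prints every misclassified sample together with
# the predicted and the true label. Not called in the main flow below.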
def print_incorrect(clf, X, Y):
    Y_hat = clf.predict(X)
    wrong_idx = Y_hat != Y
    X_wrong = X[wrong_idx]
    Y_wrong = Y[wrong_idx]
    Y_hat_wrong = Y_hat[wrong_idx]
    for idx in xrange(len(X_wrong)):
        print "clf.predict('%s')=%i instead of %i" %\
            (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])


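# Run four binary classification tasks on the Sanders tweet corpus:
# positive vs. negative, sentiment vs. no sentiment, positive vs. rest,
# and negative vs. rest.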
if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])

    train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True)

    print "== Pos/neg vs. irrelevant/neutral =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])
    train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True)

    print "== Pos vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True)

    print "== Neg vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["negative"])
    train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True)

    print "time spent:", time.time() - start_time
@@ -0,0 +1,181 @@
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

#
# This script tries to tweak hyperparameters to improve P/R AUC
#

import time
start_time = time.time()

import numpy as np

from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit

from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score

from sklearn.naive_bayes import MultinomialNB

phase = "02"


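# Same pipeline as in the first script, but parameterizable so that the
# winning combination from the grid search can be plugged back in.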
def create_ngram_model(params=None):
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline


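# Exhaustive search over vectorizer and classifier hyperparameters:
# 3 * 2 * 2 * 2 * 2 * 2 * 2 * 6 = 1152 parameter combinations, each
# evaluated on 10 shuffle splits (11,520 fits in total), scored by F1.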
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf


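# Unlike the first script, train_model() here receives a ready-made
# classifier instead of a factory and re-fits it on every CV fold.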
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)


def print_incorrect(clf, X, Y):
    Y_hat = clf.predict(X)
    wrong_idx = Y_hat != Y
    X_wrong = X[wrong_idx]
    Y_wrong = Y[wrong_idx]
    Y_hat_wrong = Y_hat[wrong_idx]
    for idx in xrange(len(X_wrong)):
        print "clf.predict('%s')=%i instead of %i" %\
            (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])


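# Best parameter combination as found by grid_search_model(), hard-coded
# here so the expensive search does not have to be re-run every time.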
def get_best_model():
    best_params = dict(vect__ngram_range=(1, 2),
                       vect__min_df=1,
                       vect__stop_words=None,
                       vect__smooth_idf=False,
                       vect__use_idf=False,
                       vect__sublinear_tf=True,
                       vect__binary=False,
                       clf__alpha=0.01,
                       )

    best_clf = create_ngram_model(best_params)

    return best_clf

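# The same four classification tasks as in the first script, now using
# the tuned model from get_best_model().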
if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)

    print "== Pos/neg vs. irrelevant/neutral =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])

    # best_clf = grid_search_model(create_ngram_model, X, Y)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print "== Pos vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs rest",
                plot=True)

    print "== Neg vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["negative"])
    train_model(get_best_model(), X, Y, name="neg vs rest",
                plot=True)

    print "time spent:", time.time() - start_time