Commit d0e9ba4

adding in chapter 6 stuff
tomzaragoza committed Jan 14, 2014
1 parent f61ab64 commit d0e9ba4
Showing 11 changed files with 8,093 additions and 0 deletions.
131 changes: 131 additions & 0 deletions ch06/01_start.py
@@ -0,0 +1,131 @@
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

#
# This script trains multinomial Naive Bayes on the tweet corpus
# to find two different results:
# - How well can we distinguish positive from negative tweets?
# - How well can we detect whether a tweet contains sentiment at all?
#

import time
start_time = time.time()

import numpy as np

from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit

from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB


def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline


def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)


def print_incorrect(clf, X, Y):
    Y_hat = clf.predict(X)
    wrong_idx = Y_hat != Y
    X_wrong = X[wrong_idx]
    Y_wrong = Y[wrong_idx]
    Y_hat_wrong = Y_hat[wrong_idx]
    for idx in xrange(len(X_wrong)):
        print "clf.predict('%s')=%i instead of %i" % \
            (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])


if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])

    train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True)

    print "== Pos/neg vs. irrelevant/neutral =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])
    train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True)

    print "== Pos vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True)

    print "== Neg vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["negative"])
    train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True)

    print "time spent:", time.time() - start_time
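
Note: load_sanders_data, tweak_labels, and plot_pr are imported from utils, which is not part of this commit. Judging from how it is called above, tweak_labels presumably collapses the original string labels into a binary target. A minimal sketch under that assumption (not the actual utils code):

import numpy as np

def tweak_labels(Y, pos_sent_list):
    # Assumed behavior: mark every tweet whose label appears in
    # pos_sent_list as the positive class (1), everything else as 0.
    pos = Y == pos_sent_list[0]
    for sent_label in pos_sent_list[1:]:
        pos |= Y == sent_label
    return np.asarray(pos, dtype=int)

With this reading, tweak_labels(Y_orig, ["positive", "negative"]) yields 1 for tweets carrying any sentiment and 0 for the irrelevant/neutral rest, which matches the "sent vs rest" task above.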
181 changes: 181 additions & 0 deletions ch06/02_tuning.py
@@ -0,0 +1,181 @@
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

#
# This script tries to tweak hyperparameters to improve P/R AUC
#

import time
start_time = time.time()

import numpy as np

from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit

from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score

from sklearn.naive_bayes import MultinomialNB

phase = "02"


def create_ngram_model(params=None):
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline


def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf


def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)


def print_incorrect(clf, X, Y):
    Y_hat = clf.predict(X)
    wrong_idx = Y_hat != Y
    X_wrong = X[wrong_idx]
    Y_wrong = Y[wrong_idx]
    Y_hat_wrong = Y_hat[wrong_idx]
    for idx in xrange(len(X_wrong)):
        print "clf.predict('%s')=%i instead of %i" % \
            (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])


def get_best_model():
    best_params = dict(vect__ngram_range=(1, 2),
                       vect__min_df=1,
                       vect__stop_words=None,
                       vect__smooth_idf=False,
                       vect__use_idf=False,
                       vect__sublinear_tf=True,
                       vect__binary=False,
                       clf__alpha=0.01,
                       )

    best_clf = create_ngram_model(best_params)

    return best_clf

if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)

    print "== Pos/neg vs. irrelevant/neutral =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])

    # best_clf = grid_search_model(create_ngram_model, X, Y)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print "== Pos vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs rest",
                plot=True)

    print "== Neg vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["negative"])
    train_model(get_best_model(), X, Y, name="neg vs rest",
                plot=True)

    print "time spent:", time.time() - start_time
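
A note on the double-underscore keys used in param_grid and get_best_model: the Pipeline routes a parameter named step__param to the step registered under that name, so vect__ngram_range configures the TfidfVectorizer and clf__alpha configures MultinomialNB. For scale, the full grid in grid_search_model spans 3 * 2 * 2 * 2 * 2 * 2 * 2 * 6 = 1152 parameter combinations, so with 10 ShuffleSplit iterations GridSearchCV fits 11,520 pipelines. A minimal illustration of the parameter routing (not part of this commit):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Same step names as in create_ngram_model above.
pipeline = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
pipeline.set_params(vect__ngram_range=(1, 2), clf__alpha=0.01)
print pipeline.get_params()['vect__ngram_range']  # (1, 2)

GridSearchCV relies on the same mechanism when it expands param_grid into candidate pipelines.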