forked from zzqboy/static_study
Commit 23760c7 (1 parent: 2fa4e30)
zzq2015 committed May 31, 2017
Showing 28 changed files with 1,460 additions and 2 deletions.
Bayes.py @@ -0,0 +1,48 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-4-19
Naive Bayes classifier; no smoothing is applied here.
"""
from __future__ import division

from itertools import izip
from collections import Counter


class Bayes(object):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.class_counter = Counter()        # counts of each class label
        self.pre_proba_counter = Counter()    # counts of each (class, feature value) pair
        self.class_proba = {}                 # prior P(Y = c)
        self.pre_proba = {}                   # conditional P(X = x_l | Y = c)

    def fit(self):
        for y, x in izip(self.y, self.x):
            self.class_counter[y] += 1
            for x_l in x:
                self.pre_proba_counter[(y, x_l)] += 1
        self.class_proba = {y: self.class_counter[y]/len(self.y) for y in self.class_counter}
        # here y is a (class, value) key, so y[0] is the class
        self.pre_proba.update({y: self.pre_proba_counter[y]/self.class_counter[y[0]] for y in
                               self.pre_proba_counter})

    def predict(self, x):
        be_proba = {}
        for y in self.class_proba:
            y_proba = self.class_proba[y]
            for x_l in x:
                y_proba *= self.pre_proba[(y, x_l)]
            be_proba[y] = y_proba
        for y_class, proba in sorted(be_proba.iteritems(), key=lambda item: item[1], reverse=True):
            print y_class, proba


if __name__ == '__main__':
    data = [[1, 's'], [1, 'M'], [1, 'M'], [1, 's'], [1, 's'], [2, 's'], [2, 'm'], [2, 'm'],
            [2, 'l'], [2, 'l'], [3, 'l'], [3, 'm'], [3, 'm'], [3, 'l'], [3, 'l']]
    label = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    b = Bayes(data, label)
    b.fit()
    b.predict([2, 's'])
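As the header comment notes, no smoothing is applied, so predict raises a KeyError whenever a (class, feature value) pair never occurred in training. Below is a minimal sketch of how Laplace (add-one) smoothing could be layered onto the same counting scheme; the class name BayesSmoothed and the lam parameter are illustrative assumptions, not code from this commit, and, like the original, feature values are pooled across positions rather than smoothed per feature dimension.

# Illustrative sketch (not part of this commit): Laplace smoothing on the same counters.
from __future__ import division
from collections import Counter


class BayesSmoothed(object):
    def __init__(self, x, y, lam=1.0):
        self.x, self.y, self.lam = x, y, lam
        self.class_counter = Counter()
        self.pair_counter = Counter()      # (class, feature value) counts
        self.values = set()                # every feature value seen in training

    def fit(self):
        for y_i, x_i in zip(self.y, self.x):
            self.class_counter[y_i] += 1
            for x_l in x_i:
                self.pair_counter[(y_i, x_l)] += 1
                self.values.add(x_l)

    def predict_scores(self, x):
        n, k, s = len(self.y), len(self.class_counter), len(self.values)
        scores = {}
        for c, n_c in self.class_counter.items():
            # smoothed prior; with lam = 0 this reduces to the unsmoothed estimate above
            p = (n_c + self.lam) / (n + k * self.lam)
            for x_l in x:
                # unseen (class, value) pairs get lam / (n_c + s*lam) instead of a KeyError
                p *= (self.pair_counter[(c, x_l)] + self.lam) / (n_c + s * self.lam)
            scores[c] = p
        return scores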
README.md @@ -1,2 +1,38 @@
# static_study
These are reading notes for Statistical Learning Methods (《统计学习方法》), trying to reimplement the wheels from the book myself. A self-made wheel is not necessarily fit for real-world use; the point is to learn the algorithms through code!
# Code implementations for Statistical Learning Methods (《统计学习方法》)
These are reading notes for Statistical Learning Methods (《统计学习方法》), trying to reimplement the wheels from the book myself. A self-made wheel is not necessarily fit for real-world use; the point is to learn the algorithms through code! :smile:
Not everything is written from scratch; references are credited throughout! O(∩_∩)O
# Notes and code
## Perceptron
Code: https://github.com/acrafter/static_study/blob/master/perceptron.py (a minimal standalone sketch of the perceptron update also follows right after this README)
## KNN
Uses a kd-tree to look up the k nearest neighbours.
Code: https://github.com/acrafter/static_study/blob/master/knn/kd_tree.py
## Naive Bayes
Why is it called "naive" Bayes? Because it assumes the features are conditionally independent given the class.
Code: https://github.com/acrafter/static_study/blob/master/Bayes.py
## Decision tree
Notes: https://github.com/acrafter/static_study/blob/master/decision_tree/决策树.pdf
Code: https://github.com/acrafter/static_study/blob/master/decision_tree/decision_tree.py
## Logistic regression
Notes:
https://github.com/acrafter/static_study/blob/master/logistic_regression/浅析Logistic Regression.pdf source: https://chenrudan.github.io/blog/2016/01/09/logisticregression.html
https://github.com/acrafter/static_study/blob/master/logistic_regression/Logistic Regression 基础.pdf source: http://www.cnblogs.com/sparkwen/p/3441197.html
Code:
https://github.com/acrafter/static_study/blob/master/logistic_regression/logistic_regression.py
## Maximum entropy
Notes (referenced from vimsky.com):
https://github.com/acrafter/static_study/blob/master/max_entropy/揭开机器学习的面纱:最大熵模型100行代码实现[Python版] - 纯净的天空.pdf
https://github.com/acrafter/static_study/blob/master/max_entropy/最大熵模型简介[例子+推导+GIS求解].pdf
Code:
https://github.com/acrafter/static_study/blob/master/max_entropy/max_ent.py
## Support vector machine
Notes:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/支持向量机.pdf
Code:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/smo.py
## Boosting
Notes:
https://github.com/acrafter/static_study/blob/master/adaboost/提升方法AdaBoost.pdf
Code:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/adaboost/adaboost.py
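The perceptron entry above only links to the code, so here is a minimal standalone sketch of the primal perceptron update (learning rate eta, weights w, bias b). It is illustrative only and is not the repository's perceptron.py; the example data and the printed result are just a small linearly separable toy case.

# Illustrative sketch (not the repository's perceptron.py): primal-form perceptron.
from __future__ import division


def perceptron_fit(x, y, eta=1.0, max_epochs=100):
    """Learn w, b so that y_i * (w . x_i + b) > 0 for every training point (if separable)."""
    n_features = len(x[0])
    w = [0.0] * n_features
    b = 0.0
    for _ in range(max_epochs):
        updated = False
        for x_i, y_i in zip(x, y):
            if y_i * (sum(w_j * x_ij for w_j, x_ij in zip(w, x_i)) + b) <= 0:
                # misclassified point: move the separating hyperplane towards it
                w = [w_j + eta * y_i * x_ij for w_j, x_ij in zip(w, x_i)]
                b += eta * y_i
                updated = True
        if not updated:      # a full pass with no mistakes: converged
            break
    return w, b


if __name__ == '__main__':
    # two positive points and one negative point
    x = [[3, 3], [4, 3], [1, 1]]
    y = [1, 1, -1]
    print(perceptron_fit(x, y))    # e.g. ([1.0, 1.0], -3.0)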
adaboost.py @@ -0,0 +1,43 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-5-26
AdaBoost with decision-tree stumps as base learners, i.e. a boosting tree for regression.
"""
from tree_stump import *

from numpy import *


class AdaBoost(object):
    def __init__(self):
        self.final_tree = None  # the combined tree built from all stumps

    def fit(self, x, y, m_estimate=6):
        ctree = CombineTree()
        y_temp = y
        for i in range(m_estimate):
            tree = Stump()  # learn a single-split stump on the current targets
            tree.fit(x, y)
            ctree.add(tree)
            # squared error of the combined tree and per-point residuals on the original targets
            se, se_i = ctree.get_se(x, y_temp)

            y = se_i  # the next stump fits the residuals of the combined tree

            # print 'iteration --------------', i
            # print 's', ctree.s
            # print 'c', ctree.c
            # print 'squared error', se
            # print 'residuals', se_i
        self.final_tree = ctree


if __name__ == '__main__':
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    y = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]
    x, y = array(x), array(y)

    ada = AdaBoost()
    ada.fit(x, y)
    print ada.final_tree.s
    print ada.final_tree.c
    print ada.final_tree.se  # final squared loss
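The header comment points out that AdaBoost with tree base learners reduces to the boosting tree above, which is the regression case fitted on residuals. For reference, here is a minimal sketch of the classification form of AdaBoost (re-weighting samples and weighting each stump by alpha), the variant covered in the AdaBoost notes linked from the README. The function names and the 1-D threshold-stump representation are illustrative assumptions, not the repository's code.

# Illustrative sketch (not part of this commit): classification AdaBoost with 1-D threshold stumps.
from __future__ import division
import math


def stump_fit(x, y, w):
    """Pick the weighted-error-minimising rule of the form direction * sign(x - thresh)."""
    best = None
    thresholds = [min(x) - 0.5] + [v + 0.5 for v in x]
    for thresh in thresholds:
        for direction in (1, -1):
            pred = [direction * (1 if x_i > thresh else -1) for x_i in x]
            err = sum(w_i for w_i, p, y_i in zip(w, pred, y) if p != y_i)
            if best is None or err < best[0]:
                best = (err, thresh, direction)
    return best


def adaboost_fit(x, y, m_estimate=3):
    n = len(x)
    w = [1.0 / n] * n                                # uniform initial sample weights
    model = []                                       # list of (alpha, thresh, direction)
    for _ in range(m_estimate):
        err, thresh, direction = stump_fit(x, y, w)
        err = max(err, 1e-10)                        # guard against a perfect stump
        alpha = 0.5 * math.log((1 - err) / err)      # weight of this stump in the final vote
        pred = [direction * (1 if x_i > thresh else -1) for x_i in x]
        # raise the weight of misclassified points, lower it for correct ones, then normalise
        w = [w_i * math.exp(-alpha * y_i * p) for w_i, y_i, p in zip(w, y, pred)]
        z = sum(w)
        w = [w_i / z for w_i in w]
        model.append((alpha, thresh, direction))
    return model


def adaboost_predict(model, x_i):
    score = sum(alpha * direction * (1 if x_i > thresh else -1)
                for alpha, thresh, direction in model)
    return 1 if score >= 0 else -1


if __name__ == '__main__':
    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
    model = adaboost_fit(x, y, m_estimate=3)
    print([adaboost_predict(model, x_i) for x_i in x])   # reproduces y after 3 rounds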
tree_stump.py @@ -0,0 +1,125 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-5-26
A single-split stump; training follows the CART regression-tree procedure.
"""
from __future__ import division
from itertools import izip

from numpy import *


class Stump(object):
    def __init__(self):
        self.s = None           # split point: the left region is x <= s
        self.c_final = []       # output constants [c1, c2] of the two regions
        self.se = None          # total squared loss of this stump
        self.se_i = []          # residual (y - prediction) for each data point

    def fit(self, x, y):
        m = shape(x)[0]
        error_lst = []
        c_lst = []
        for i in range(1, m):  # candidate split i: region 1 is x[:i], region 2 is x[i:]
            error = 0
            c_1 = sum(y[:i]) / i
            c_2 = sum(y[i:]) / (m - i)
            for x_i in range(i):
                error += (y[x_i]-c_1)**2
            for x_i in range(i, m):
                error += (y[x_i]-c_2)**2
            error_lst.append(error)
            c_lst.append([c_1, c_2])

        min_error = min(error_lst)
        min_index = error_lst.index(min_error)

        self.s = x[min_index]
        self.c_final = c_lst[min_index]
        self.get_se(x, y)
        # print error_lst

        return self.s, self.c_final  # best split point s and the region constants c1, c2

    def get_se(self, x, y):
        """Compute the squared loss of this stump and the per-point residuals."""
        se = 0
        se_i = []
        for x_i, y_i in izip(x, y):
            pred = self.predict(x_i)
            se += (pred - y_i)**2
            se_i.append(y_i - pred)
        self.se_i = se_i
        self.se = se

    def predict(self, x):
        if x <= self.s:
            return self.c_final[0]
        else:
            return self.c_final[1]


class CombineTree(object):
    """Combine several single-split stumps into one piecewise-constant tree."""
    def __init__(self):
        self.stumps = []
        self.s = []             # sorted, de-duplicated split points
        self.c = []             # constant for each region between split points
        self.se = None
        self.se_i = None        # per-point residuals, used to train the next stump

    def add(self, stump):
        self.stumps.append(stump)
        self.s.append(stump.s)
        self.s = sorted(set(self.s))            # keep split points unique and sorted
        c = zeros((len(self.s)+1, 1))
        for stump in self.stumps:
            id = self.s.index(stump.s)
            c[:id+1] += stump.c_final[0]        # the stump's left constant applies up to its split
            c[id+1:] += stump.c_final[1]        # the stump's right constant applies beyond it
        self.c = c

    def predict(self, x):
        for i, s in enumerate(self.s):
            if x <= s:
                return self.c[i]
        return self.c[-1]

    def get_se(self, x, y):
        """Compute the squared loss of the combined tree and the per-point residuals."""
        se = 0
        se_i = []
        for x_i, y_i in izip(x, y):
            pred = self.predict(x_i)
            se += (pred - y_i) ** 2
            se_i.append(y_i - pred)
        self.se_i = se_i
        self.se = se
        return self.se, self.se_i


if __name__ == '__main__':
    # Example 8.2 from the book
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    y = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]
    x, y = array(x), array(y)
    # tree = Stump()
    # print tree.fit(x, y)
    # print tree.se, tree.se_i

    ctree = CombineTree()
    s1 = Stump()
    s1.s, s1.c_final = 6.5, [6.24, 8.91]
    s2 = Stump()
    s2.s, s2.c_final = 3.5, [-0.52, 0.22]
    s3 = Stump()
    s3.s, s3.c_final = 6.5, [0.15, -0.22]
    ctree.add(s1)
    print ctree.get_se(x, y)
    ctree.add(s2)
    print ctree.get_se(x, y)
    ctree.add(s3)
    print ctree.get_se(x, y)
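As a quick worked check on CombineTree.add with the stumps above: after ctree.add(s1) (split 6.5, constants 6.24 / 8.91) and ctree.add(s2) (split 3.5, constants -0.52 / 0.22), the merged split points are [3.5, 6.5] and the summed region constants are 6.24 - 0.52 = 5.72 for x <= 3.5, 6.24 + 0.22 = 6.46 for 3.5 < x <= 6.5, and 8.91 + 0.22 = 9.13 for x > 6.5, so ctree.predict(5) falls in the middle region and returns 6.46.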
decision_tree.py @@ -0,0 +1,120 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-4-26
Decision tree, ID3
"""
from __future__ import division

import math
from itertools import izip
from collections import Counter, defaultdict

import numpy as np


class Node(object):
    def __init__(self, feature):
        self.feature = feature
        self.value = {}   # maps a feature value to either a class label (leaf) or a child Node


class DecisionTree(object):
    def __init__(self):
        self._label_counter = Counter()                      # corresponds to C_k
        self._feature_counter = defaultdict(Counter)         # corresponds to D_i
        self._feature_label_counter = defaultdict(Counter)   # corresponds to D_ik

        self._entropy = 0
        self.information_gain = {}

    def _fit(self, x, y):
        d = 0
        for x_i, y_i in izip(x, y):
            d += 1
            self._label_counter[y_i] += 1
            for feature, value in enumerate(x_i):
                self._feature_counter[feature][value] += 1
                self._feature_label_counter[(feature, value)][y_i] += 1

        # empirical entropy H(D)
        for _, c_i in self._label_counter.iteritems():
            self._entropy += -1*(abs(c_i)/abs(d))*math.log(abs(c_i)/abs(d), 2)

        # information gain g(D, A) = H(D) - H(D|A) for every feature A
        for feature, value_count in self._feature_counter.iteritems():
            _condition_value = 0
            value_count = value_count.iteritems()
            for value, count in value_count:
                for label, count2 in self._feature_label_counter[(feature, value)].iteritems():
                    _condition_value += (abs(count)/abs(d))*(abs(count2)/abs(count))*math.log(abs(count2)/abs(count), 2)
            _condition_value *= -1
            self.information_gain[feature] = self._entropy - _condition_value

        return True

    def build_tree(self, x, y):
        # re-initialise the counters for this node
        self._label_counter = Counter()
        self._feature_counter = defaultdict(Counter)
        self._feature_label_counter = defaultdict(Counter)
        self._entropy = 0
        self.information_gain = {}

        self._fit(x, y)
        self.information_gain = sorted(self.information_gain.iteritems(), key=lambda item: item[1], reverse=True)
        print 'Information gain of each feature\n', self.information_gain

        feature = self.information_gain[0][0]
        print 'Selecting feature {0}'.format(feature)
        node = Node(feature)

        # keep local references: the recursive call below re-initialises the instance counters
        feature_counter = self._feature_counter[feature]
        feature_label_counter = self._feature_label_counter

        for value, counter in feature_counter.iteritems():
            if len(feature_label_counter[(feature, value)]) == 1:
                # only one class left for this value: make a leaf
                node.value[value] = feature_label_counter[(feature, value)].keys()[0]
                print 'Value', value, 'leaf node', node.value[value]
            else:
                # recurse on the subset of rows that take this value
                temp = x[:, feature] == value
                node.value[value] = self.build_tree(x[temp], y[temp])

        return node


if __name__ == '__main__':
    # The loan-application data table from the book, numerically encoded; the features are 0, 1, 2, 3
    # age: young, middle-aged, old = 1, 2, 3
    # has a job: yes, no = 1, 2
    # owns a house: yes, no = 1, 2
    # credit rating: fair, good, excellent = 1, 2, 3
    data = [[1, 2, 2, 1],
            [1, 2, 2, 2],
            [1, 1, 2, 2],
            [1, 1, 1, 1],
            [1, 2, 2, 1],
            [2, 2, 2, 1],
            [2, 2, 2, 2],
            [2, 1, 1, 2],
            [2, 2, 1, 3],
            [2, 2, 1, 3],
            [3, 2, 1, 3],
            [3, 2, 1, 2],
            [3, 1, 2, 2],
            [3, 1, 2, 3],
            [3, 2, 2, 1]]
    label = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
    data, label = np.array(data), np.array(label)

    decision_tree = DecisionTree()
    id3_tree = decision_tree.build_tree(data, label)

    """Output
    Information gain of each feature
    [(2, 0.4199730940219748), (3, 0.36298956253708536), (1, 0.32365019815155616), (0, 0.08300749985576883)]
    Selecting feature 2
    Value 1 leaf node 1
    Information gain of each feature
    [(1, 0.9182958340544896), (3, 0.47385138961004514), (0, 0.2516291673878229), (2, 0.0)]
    Selecting feature 1
    Value 1 leaf node 1
    Value 2 leaf node 0
    """