Skip to content

Commit

Permalink
nothing
Browse files Browse the repository at this point in the history
  • Loading branch information
zzq2015 committed May 31, 2017
1 parent 2fa4e30 commit 23760c7
Show file tree
Hide file tree
Showing 28 changed files with 1,460 additions and 2 deletions.
48 changes: 48 additions & 0 deletions Bayes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-4-19
贝叶斯分类器,这里没有加平滑
"""
from __future__ import division

from itertools import izip
from collections import Counter


class Bayes(object):
    """Naive Bayes classifier without smoothing.

    All probabilities are maximum-likelihood estimates obtained by counting:
    class priors P(y) and per-feature conditionals P(x_l | y).
    """

    def __init__(self, x, y):
        self.x = x                          # training samples (iterables of feature values)
        self.y = y                          # training labels, parallel to x
        self.class_counter = Counter()      # label -> count
        self.pre_proba_counter = Counter()  # (label, feature value) -> count
        self.class_proba = {}               # label -> P(y), filled by fit()
        self.pre_proba = {}                 # (label, feature value) -> P(x_l | y)

    def fit(self):
        """Estimate P(y) and P(x_l | y) by relative frequency counting."""
        for label, sample in zip(self.y, self.x):
            self.class_counter[label] += 1
            for feature in sample:
                self.pre_proba_counter[(label, feature)] += 1
        total = len(self.y)
        self.class_proba = {label: count / total
                            for label, count in self.class_counter.items()}
        self.pre_proba.update(
            {key: count / self.class_counter[key[0]]
             for key, count in self.pre_proba_counter.items()})

    def predict(self, x):
        """Print each class with its unnormalized posterior, largest first.

        BUG FIX: an unseen (label, feature) pair used to raise KeyError;
        with no smoothing its conditional probability is simply 0.
        """
        scores = {}
        for label, prior in self.class_proba.items():
            score = prior
            for feature in x:
                score *= self.pre_proba.get((label, feature), 0)
            scores[label] = score
        for label, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
            print('{0} {1}'.format(label, score))


if __name__ == '__main__':
    # Toy data: each sample is (number, letter); labels are +1 / -1.
    samples = [[1, 's'], [1, 'M'], [1, 'M'], [1, 's'], [1, 's'], [2, 's'], [2, 'm'], [2, 'm'],
               [2, 'l'], [2, 'l'], [3, 'l'], [3, 'm'], [3, 'm'], [3, 'l'], [3, 'l']]
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    classifier = Bayes(samples, labels)
    classifier.fit()
    classifier.predict([2, 's'])
40 changes: 38 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,38 @@
# static_study
这里是《统计学习方法》的读书笔记,尝试自己实现里面的轮子。其实任何自己实现的轮子并不一定具备实际应用的能力,目的是通过代码来学习算法!
# 《统计学习方法》的代码实现。
这里是《统计学习方法》的读书笔记,尝试自己实现里面的轮子。其实任何自己实现的轮子并不一定具备实际应用的能力,目的是通过代码来学习算法!:smile:
不全是自己写,参考均有说明!O(∩_∩)O
# 笔记与代码
## 感知机
代码: https://github.com/acrafter/static_study/blob/master/perceptron.py
## KNN
用kd树来查找k个近邻点
代码: https://github.com/acrafter/static_study/blob/master/knn/kd_tree.py
## 朴素贝叶斯
为什么叫朴素贝叶斯呢?
代码: https://github.com/acrafter/static_study/blob/master/Bayes.py
## 决策树
笔记: https://github.com/acrafter/static_study/blob/master/decision_tree/决策树.pdf
代码: https://github.com/acrafter/static_study/blob/master/decision_tree/decision_tree.py
## 逻辑回归
笔记:
https://github.com/acrafter/static_study/blob/master/logistic_regression/浅析Logistic Regression.pdf 来源: https://chenrudan.github.io/blog/2016/01/09/logisticregression.html
https://github.com/acrafter/static_study/blob/master/logistic_regression/Logistic Regression 基础.pdf 来源: http://www.cnblogs.com/sparkwen/p/3441197.html
代码
https://github.com/acrafter/static_study/blob/master/logistic_regression/logistic_regression.py
## 最大熵
笔记:
参考自vimsky.com
https://github.com/acrafter/static_study/blob/master/max_entropy/揭开机器学习的面纱:最大熵模型100行代码实现[Python版] - 纯净的天空.pdf
https://github.com/acrafter/static_study/blob/master/max_entropy/最大熵模型简介[例子+推导+GIS求解].pdf
代码:
https://github.com/acrafter/static_study/blob/master/max_entropy/max_ent.py
## 支持向量机
笔记:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/支持向量机.pdf
代码:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/smo.py
## 提升方法
笔记:
https://github.com/acrafter/static_study/blob/master/adaboost/提升方法AdaBoost.pdf
代码:
https://github.com/acrafter/static_study/blob/master/adaboost/adaboost.py
43 changes: 43 additions & 0 deletions adaboost/adaboost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-5-26
adaboost用决策树做基函数等于提升树
"""
from tree_stump import *

from numpy import *


class AdaBoost(object):
    """Boosting tree for regression: forward-stagewise fitting of stumps to
    the residuals of the current combined tree (squared loss)."""

    def __init__(self):
        # The combined tree assembled by fit().
        self.final_tree = None

    def fit(self, x, y, m_estimate=6):
        """Fit `m_estimate` stumps; each round trains a stump on the current
        residuals and adds it to the combined tree.

        x, y -- training inputs/targets; m_estimate -- number of rounds.
        """
        combined = CombineTree()
        residuals = y  # the first stump fits the raw targets
        for _ in range(m_estimate):
            stump = Stump()
            stump.fit(x, residuals)
            combined.add(stump)
            # Squared error of the combined tree against the ORIGINAL targets;
            # its per-sample residuals become the next round's targets.
            # (Replaces the pointless `x, y = x, se_i` self-assignment.)
            _se, residuals = combined.get_se(x, y)
        self.final_tree = combined

if __name__ == '__main__':
    # Example 8.2 data from the book.
    xs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    ys = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]
    xs, ys = array(xs), array(ys)

    booster = AdaBoost()
    booster.fit(xs, ys)
    print(booster.final_tree.s)
    print(booster.final_tree.c)
    print(booster.final_tree.se)  # final squared loss of the combined tree
125 changes: 125 additions & 0 deletions adaboost/tree_stump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-5-26
单个结点的树桩,训练过程为CART树的过程
"""
from __future__ import division
from itertools import izip

from numpy import *


class Stump(object):
    """Single-split regression stump trained CART-style: choose the split
    minimizing the squared error of the two side means."""

    def __init__(self):
        self.s = None      # split threshold: predict c_final[0] when x <= s
        self.c_final = []  # [left-side mean, right-side mean]
        self.se = None     # total squared error on the data last fitted/scored
        self.se_i = []     # per-sample residuals (y - prediction)

    def fit(self, x, y):
        """Try every split index i (left = first i samples) and keep the one
        with the smallest squared error; returns (split value, [c1, c2]).

        NOTE(review): assumes x is sorted ascending; the stored threshold is
        x[i-1] (the last left-hand sample), not the midpoint — TODO confirm
        this matches the caller's expectation.
        """
        m = len(x)  # number of samples (was shape(x)[0]; input is 1-D)
        errors = []
        means = []
        for i in range(1, m):
            c_1 = sum(y[:i]) / i
            c_2 = sum(y[i:]) / (m - i)
            err = sum((y_j - c_1) ** 2 for y_j in y[:i])
            err += sum((y_j - c_2) ** 2 for y_j in y[i:])
            errors.append(err)
            means.append([c_1, c_2])

        best = errors.index(min(errors))
        self.s = x[best]
        self.c_final = means[best]
        self.get_se(x, y)  # cache se / se_i for the boosting caller

        return self.s, self.c_final

    def get_se(self, x, y):
        """Compute and store the total squared error and the per-sample
        residuals of this stump on (x, y)."""
        se = 0
        residuals = []
        # BUG FIX: the loop variable used to shadow the parameter `y`;
        # izip was Python-2-only, builtin zip is equivalent here.
        for x_i, y_i in zip(x, y):
            pred = self.predict(x_i)
            se += (pred - y_i) ** 2
            residuals.append(y_i - pred)
        self.se_i = residuals
        self.se = se

    def predict(self, x):
        """Left mean if x is at or below the split point, else right mean."""
        if x <= self.s:
            return self.c_final[0]
        return self.c_final[1]


class CombineTree(object):
    """Additive combination of Stumps: a piecewise-constant regressor whose
    prediction is the sum of its stumps' outputs."""

    def __init__(self):
        self.stumps = []  # stumps added so far
        self.s = []       # sorted, de-duplicated split points
        self.c = []       # region values; c[i] is the value of the i-th region
        self.se = None    # total squared error from the last get_se() call
        self.se_i = None  # per-sample residuals; targets for the next stump

    def add(self, stump):
        """Add a stump and rebuild the per-region summed values.

        BUG FIX: `self.s.sort()` followed by `list(set(...))` destroyed the
        ordering (set iteration order is arbitrary); the split points must
        stay sorted for the region arithmetic and predict() to work.
        """
        self.stumps.append(stump)
        self.s.append(stump.s)
        self.s = sorted(set(self.s))
        region_values = zeros((len(self.s) + 1, 1))
        for st in self.stumps:
            idx = self.s.index(st.s)  # was `id`, shadowing the builtin
            region_values[:idx + 1] += st.c_final[0]
            region_values[idx + 1:] += st.c_final[1]
        self.c = region_values

    def predict(self, x):
        """Return the value of the region containing x.

        BUG FIX: the `else: return self.c[-1]` was attached to the `if`, so
        any x above the FIRST split point fell straight through to the last
        region; now every split point is scanned before defaulting.
        """
        for i, split in enumerate(self.s):
            if x <= split:
                return self.c[i]
        return self.c[-1]

    def get_se(self, x, y):
        """Return (total squared error, per-sample residuals) on (x, y);
        both are also stored on the instance."""
        se = 0
        residuals = []
        # zip replaces Python-2-only izip; loop variable no longer shadows y.
        for x_i, y_i in zip(x, y):
            pred = self.predict(x_i)
            se += (pred - y_i) ** 2
            residuals.append(y_i - pred)
        self.se_i = residuals
        self.se = se
        return self.se, self.se_i


if __name__ == '__main__':
    # Example 8.2 from the book.
    xs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    ys = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]
    xs, ys = array(xs), array(ys)

    # Combine three stumps preset with the book's split points / leaf values,
    # printing the combined squared error after each addition.
    ctree = CombineTree()
    for split_point, leaf_values in [(6.5, [6.24, 8.91]),
                                     (3.5, [-0.52, 0.22]),
                                     (6.5, [0.15, -0.22])]:
        stump = Stump()
        stump.s, stump.c_final = split_point, leaf_values
        ctree.add(stump)
        print(ctree.get_se(xs, ys))
Binary file added adaboost/tree_stump.pyc
Binary file not shown.
Binary file added adaboost/提升方法AdaBoost.pdf
Binary file not shown.
120 changes: 120 additions & 0 deletions decision_tree/decision_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-4-26
决策树,id3
"""
from __future__ import division

import math
from itertools import izip
from collections import Counter, defaultdict

import numpy as np


class Node(object):
    """A decision-tree node.

    `feature` is the column index this node splits on; `value` maps each
    observed feature value to either a child Node or a leaf class label.
    """
    def __init__(self, feature):
        self.feature = feature
        self.value = {}


class DecisionTree(object):
    """ID3 decision tree: split on the feature with the largest information
    gain; a branch whose subset is label-pure becomes a leaf."""

    def __init__(self):
        self._label_counter = Counter()                      # C_k: label counts
        self._feature_counter = defaultdict(Counter)         # D_i: per-feature value counts
        self._feature_label_counter = defaultdict(Counter)   # D_ik: (feature, value) -> label counts

        self._entropy = 0          # empirical entropy H(D) of the current subset
        self.information_gain = {}  # feature -> H(D) - H(D|feature)

    def _fit(self, x, y):
        """Count frequencies over (x, y) and fill self.information_gain."""
        d = 0  # number of samples
        for row, label in zip(x, y):
            d += 1
            self._label_counter[label] += 1
            for feature, value in enumerate(row):
                self._feature_counter[feature][value] += 1
                self._feature_label_counter[(feature, value)][label] += 1

        # Empirical entropy H(D).
        for _, c_i in self._label_counter.items():
            self._entropy += -1 * (abs(c_i) / abs(d)) * math.log(abs(c_i) / abs(d), 2)

        # Conditional entropy H(D|feature), then the gain per feature.
        for feature, value_count in self._feature_counter.items():
            _condition_value = 0
            for value, count in value_count.items():
                for label, count2 in self._feature_label_counter[(feature, value)].items():
                    _condition_value += (abs(count) / abs(d)) * (abs(count2) / abs(count)) * math.log(abs(count2) / abs(count), 2)
            _condition_value *= -1
            self.information_gain[feature] = self._entropy - _condition_value

        return True

    def build_tree(self, x, y):
        """Recursively build an ID3 tree over numpy arrays x (samples by rows)
        and y (labels); returns the root Node."""
        # Reset all statistics: every (recursive) call recomputes them on
        # its own data subset.
        self._label_counter = Counter()
        self._feature_counter = defaultdict(Counter)
        self._feature_label_counter = defaultdict(Counter)
        self._entropy = 0
        self.information_gain = {}

        self._fit(x, y)
        self.information_gain = sorted(self.information_gain.items(),
                                       key=lambda kv: kv[1], reverse=True)
        # BUG FIX: this used to print via the module-level global name
        # `decision_tree` instead of self, breaking any other instance.
        print('{0} {1}'.format('各个特征的信息增益\n', self.information_gain))

        feature = self.information_gain[0][0]  # feature with the largest gain
        print('选择第{0}个特征'.format(feature))
        node = Node(feature)

        # BUG FIX: snapshot the chosen feature's statistics BEFORE recursing —
        # recursive calls reset the counters, and the old code also clobbered
        # x/y inside the loop, filtering later branches from an already
        # filtered subset.
        branches = list(self._feature_counter[feature].keys())
        label_counts = {value: self._feature_label_counter[(feature, value)]
                        for value in branches}

        for value in branches:
            labels_here = label_counts[value]
            if len(labels_here) == 1:
                # Pure branch: leaf carrying the only remaining label.
                node.value[value] = list(labels_here.keys())[0]
                print('{0} {1} {2} {3}'.format('取值', value, '叶节点', node.value[value]))
            else:
                # Recurse on the rows taking this feature value.
                mask = x[:, feature] == value
                node.value[value] = self.build_tree(x[mask], y[mask])

        return node


if __name__ == '__main__':
    # Loan-application data table from the book, numerically encoded;
    # the four feature columns are indexed 0, 1, 2, 3:
    #   column 0 — age: young / middle-aged / old           = 1, 2, 3
    #   column 1 — has a job: yes / no                      = 1, 2
    #   column 2 — owns a house: yes / no                   = 1, 2
    #   column 3 — credit standing: fair / good / excellent = 1, 2, 3
    data = [[1, 2, 2, 1],
            [1, 2, 2, 2],
            [1, 1, 2, 2],
            [1, 1, 1, 1],
            [1, 2, 2, 1],
            [2, 2, 2, 1],
            [2, 2, 2, 2],
            [2, 1, 1, 2],
            [2, 2, 1, 3],
            [2, 2, 1, 3],
            [3, 2, 1, 3],
            [3, 2, 1, 2],
            [3, 1, 2, 2],
            [3, 1, 2, 3],
            [3, 2, 2, 1]]
    label = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
    data, label = np.array(data), np.array(label)

    # NOTE(review): keep this variable named `decision_tree` —
    # DecisionTree.build_tree's debug print may reference it as a module
    # global; verify before renaming.
    decision_tree = DecisionTree()
    id3_tree = decision_tree.build_tree(data, label)

    # Recorded program output (kept verbatim for reference).
    """输出
    各个特征的信息增益
    [(2, 0.4199730940219748), (3, 0.36298956253708536), (1, 0.32365019815155616), (0, 0.08300749985576883)]
    选择第2个特征
    取值 1 叶节点 1
    各个特征的信息增益
    [(1, 0.9182958340544896), (3, 0.47385138961004514), (0, 0.2516291673878229), (2, 0.0)]
    选择第1个特征
    取值 1 叶节点 1
    取值 2 叶节点 0
    """
Binary file added decision_tree/决策树.pdf
Binary file not shown.
Loading

0 comments on commit 23760c7

Please sign in to comment.