forked from zzqboy/static_study
Commit 23760c7 (1 parent: 2fa4e30)
zzq2015 committed May 31, 2017
Showing 28 changed files with 1,460 additions and 2 deletions.
Bayes.py @@ -0,0 +1,48 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-4-19
Naive Bayes classifier; no smoothing is applied here.
"""
from __future__ import division

from itertools import izip
from collections import Counter


class Bayes(object):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.class_counter = Counter()        # counts of each class label
        self.pre_proba_counter = Counter()    # counts of each (class, feature value) pair
        self.class_proba = {}                 # prior P(Y = c)
        self.pre_proba = {}                   # conditional P(X = x_l | Y = c)

    def fit(self):
        for y, x in izip(self.y, self.x):
            self.class_counter[y] += 1
            for x_l in x:
                self.pre_proba_counter[(y, x_l)] += 1
        self.class_proba = {y: self.class_counter[y]/len(self.y) for y in self.class_counter}
        # here y is a (class, value) key, so y[0] is the class
        self.pre_proba.update({y: self.pre_proba_counter[y]/self.class_counter[y[0]] for y in
                               self.pre_proba_counter})

    def predict(self, x):
        be_proba = {}
        for y in self.class_proba:
            y_proba = self.class_proba[y]
            for x_l in x:
                y_proba *= self.pre_proba[(y, x_l)]
            be_proba[y] = y_proba
        for y_class, proba in sorted(be_proba.iteritems(), key=lambda item: item[1], reverse=True):
            print y_class, proba


if __name__ == '__main__':
    data = [[1, 's'], [1, 'M'], [1, 'M'], [1, 's'], [1, 's'], [2, 's'], [2, 'm'], [2, 'm'],
            [2, 'l'], [2, 'l'], [3, 'l'], [3, 'm'], [3, 'm'], [3, 'l'], [3, 'l']]
    label = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    b = Bayes(data, label)
    b.fit()
    b.predict([2, 's'])
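As the header comment notes, no smoothing is applied, so predict raises a KeyError whenever a (class, feature value) pair never occurred in training. Below is a minimal sketch of how Laplace (add-one) smoothing could be layered onto the same counting scheme; the class name BayesSmoothed and the lam parameter are illustrative assumptions, not code from this commit, and, like the original, feature values are pooled across positions rather than smoothed per feature dimension.

# Illustrative sketch (not part of this commit): Laplace smoothing on the same counters.
from __future__ import division
from collections import Counter


class BayesSmoothed(object):
    def __init__(self, x, y, lam=1.0):
        self.x, self.y, self.lam = x, y, lam
        self.class_counter = Counter()
        self.pair_counter = Counter()      # (class, feature value) counts
        self.values = set()                # every feature value seen in training

    def fit(self):
        for y_i, x_i in zip(self.y, self.x):
            self.class_counter[y_i] += 1
            for x_l in x_i:
                self.pair_counter[(y_i, x_l)] += 1
                self.values.add(x_l)

    def predict_scores(self, x):
        n, k, s = len(self.y), len(self.class_counter), len(self.values)
        scores = {}
        for c, n_c in self.class_counter.items():
            # smoothed prior; with lam = 0 this reduces to the unsmoothed estimate above
            p = (n_c + self.lam) / (n + k * self.lam)
            for x_l in x:
                # unseen (class, value) pairs get lam / (n_c + s*lam) instead of a KeyError
                p *= (self.pair_counter[(c, x_l)] + self.lam) / (n_c + s * self.lam)
            scores[c] = p
        return scores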
README.md @@ -1,2 +1,38 @@
# static_study
These are reading notes for Statistical Learning Methods (《统计学习方法》), trying to reimplement the wheels from the book myself. A self-made wheel is not necessarily fit for real-world use; the point is to learn the algorithms through code!
# Code implementations for Statistical Learning Methods (《统计学习方法》)
These are reading notes for Statistical Learning Methods (《统计学习方法》), trying to reimplement the wheels from the book myself. A self-made wheel is not necessarily fit for real-world use; the point is to learn the algorithms through code! :smile:
Not everything is written from scratch; references are credited throughout! O(∩_∩)O
# Notes and code
## Perceptron
Code: https://github.com/acrafter/static_study/blob/master/perceptron.py (a minimal standalone sketch of the perceptron update also follows right after this README)
## KNN
Uses a kd-tree to look up the k nearest neighbours.
Code: https://github.com/acrafter/static_study/blob/master/knn/kd_tree.py
## Naive Bayes
Why is it called "naive" Bayes? Because it assumes the features are conditionally independent given the class.
Code: https://github.com/acrafter/static_study/blob/master/Bayes.py
## Decision tree
Notes: https://github.com/acrafter/static_study/blob/master/decision_tree/决策树.pdf
Code: https://github.com/acrafter/static_study/blob/master/decision_tree/decision_tree.py
## Logistic regression
Notes:
https://github.com/acrafter/static_study/blob/master/logistic_regression/浅析Logistic Regression.pdf source: https://chenrudan.github.io/blog/2016/01/09/logisticregression.html
https://github.com/acrafter/static_study/blob/master/logistic_regression/Logistic Regression 基础.pdf source: http://www.cnblogs.com/sparkwen/p/3441197.html
Code:
https://github.com/acrafter/static_study/blob/master/logistic_regression/logistic_regression.py
## Maximum entropy
Notes (referenced from vimsky.com):
https://github.com/acrafter/static_study/blob/master/max_entropy/揭开机器学习的面纱:最大熵模型100行代码实现[Python版] - 纯净的天空.pdf
https://github.com/acrafter/static_study/blob/master/max_entropy/最大熵模型简介[例子+推导+GIS求解].pdf
Code:
https://github.com/acrafter/static_study/blob/master/max_entropy/max_ent.py
## Support vector machine
Notes:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/支持向量机.pdf
Code:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/smo.py
## Boosting
Notes:
https://github.com/acrafter/static_study/blob/master/adaboost/提升方法AdaBoost.pdf
Code:
https://github.com/acrafter/static_study/blob/master/support_vector_machine/adaboost/adaboost.py
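The perceptron entry above only links to the code, so here is a minimal standalone sketch of the primal perceptron update (learning rate eta, weights w, bias b). It is illustrative only and is not the repository's perceptron.py; the example data and the printed result are just a small linearly separable toy case.

# Illustrative sketch (not the repository's perceptron.py): primal-form perceptron.
from __future__ import division


def perceptron_fit(x, y, eta=1.0, max_epochs=100):
    """Learn w, b so that y_i * (w . x_i + b) > 0 for every training point (if separable)."""
    n_features = len(x[0])
    w = [0.0] * n_features
    b = 0.0
    for _ in range(max_epochs):
        updated = False
        for x_i, y_i in zip(x, y):
            if y_i * (sum(w_j * x_ij for w_j, x_ij in zip(w, x_i)) + b) <= 0:
                # misclassified point: move the separating hyperplane towards it
                w = [w_j + eta * y_i * x_ij for w_j, x_ij in zip(w, x_i)]
                b += eta * y_i
                updated = True
        if not updated:      # a full pass with no mistakes: converged
            break
    return w, b


if __name__ == '__main__':
    # two positive points and one negative point
    x = [[3, 3], [4, 3], [1, 1]]
    y = [1, 1, -1]
    print(perceptron_fit(x, y))    # e.g. ([1.0, 1.0], -3.0)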
adaboost.py @@ -0,0 +1,43 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-5-26
AdaBoost with decision-tree stumps as base learners, i.e. a boosting tree for regression.
"""
from tree_stump import *

from numpy import *


class AdaBoost(object):
    def __init__(self):
        self.final_tree = None  # the combined tree built from all stumps

    def fit(self, x, y, m_estimate=6):
        ctree = CombineTree()
        y_temp = y
        for i in range(m_estimate):
            tree = Stump()  # learn a single-split stump on the current targets
            tree.fit(x, y)
            ctree.add(tree)
            # squared error of the combined tree and per-point residuals on the original targets
            se, se_i = ctree.get_se(x, y_temp)

            y = se_i  # the next stump fits the residuals of the combined tree

            # print 'iteration --------------', i
            # print 's', ctree.s
            # print 'c', ctree.c
            # print 'squared error', se
            # print 'residuals', se_i
        self.final_tree = ctree


if __name__ == '__main__':
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    y = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]
    x, y = array(x), array(y)

    ada = AdaBoost()
    ada.fit(x, y)
    print ada.final_tree.s
    print ada.final_tree.c
    print ada.final_tree.se  # final squared loss
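The header comment points out that AdaBoost with tree base learners reduces to the boosting tree above, which is the regression case fitted on residuals. For reference, here is a minimal sketch of the classification form of AdaBoost (re-weighting samples and weighting each stump by alpha), the variant covered in the AdaBoost notes linked from the README. The function names and the 1-D threshold-stump representation are illustrative assumptions, not the repository's code.

# Illustrative sketch (not part of this commit): classification AdaBoost with 1-D threshold stumps.
from __future__ import division
import math


def stump_fit(x, y, w):
    """Pick the weighted-error-minimising rule of the form direction * sign(x - thresh)."""
    best = None
    thresholds = [min(x) - 0.5] + [v + 0.5 for v in x]
    for thresh in thresholds:
        for direction in (1, -1):
            pred = [direction * (1 if x_i > thresh else -1) for x_i in x]
            err = sum(w_i for w_i, p, y_i in zip(w, pred, y) if p != y_i)
            if best is None or err < best[0]:
                best = (err, thresh, direction)
    return best


def adaboost_fit(x, y, m_estimate=3):
    n = len(x)
    w = [1.0 / n] * n                                # uniform initial sample weights
    model = []                                       # list of (alpha, thresh, direction)
    for _ in range(m_estimate):
        err, thresh, direction = stump_fit(x, y, w)
        err = max(err, 1e-10)                        # guard against a perfect stump
        alpha = 0.5 * math.log((1 - err) / err)      # weight of this stump in the final vote
        pred = [direction * (1 if x_i > thresh else -1) for x_i in x]
        # raise the weight of misclassified points, lower it for correct ones, then normalise
        w = [w_i * math.exp(-alpha * y_i * p) for w_i, y_i, p in zip(w, y, pred)]
        z = sum(w)
        w = [w_i / z for w_i in w]
        model.append((alpha, thresh, direction))
    return model


def adaboost_predict(model, x_i):
    score = sum(alpha * direction * (1 if x_i > thresh else -1)
                for alpha, thresh, direction in model)
    return 1 if score >= 0 else -1


if __name__ == '__main__':
    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    y = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
    model = adaboost_fit(x, y, m_estimate=3)
    print([adaboost_predict(model, x_i) for x_i in x])   # reproduces y after 3 rounds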
tree_stump.py @@ -0,0 +1,125 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-5-26
A single-split stump; training follows the CART regression-tree procedure.
"""
from __future__ import division
from itertools import izip

from numpy import *


class Stump(object):
    def __init__(self):
        self.s = None           # split point: the left region is x <= s
        self.c_final = []       # output constants [c1, c2] of the two regions
        self.se = None          # total squared loss of this stump
        self.se_i = []          # residual (y - prediction) for each data point

    def fit(self, x, y):
        m = shape(x)[0]
        error_lst = []
        c_lst = []
        for i in range(1, m):  # candidate split i: region 1 is x[:i], region 2 is x[i:]
            error = 0
            c_1 = sum(y[:i]) / i
            c_2 = sum(y[i:]) / (m - i)
            for x_i in range(i):
                error += (y[x_i]-c_1)**2
            for x_i in range(i, m):
                error += (y[x_i]-c_2)**2
            error_lst.append(error)
            c_lst.append([c_1, c_2])

        min_error = min(error_lst)
        min_index = error_lst.index(min_error)

        self.s = x[min_index]
        self.c_final = c_lst[min_index]
        self.get_se(x, y)
        # print error_lst

        return self.s, self.c_final  # best split point s and the region constants c1, c2

    def get_se(self, x, y):
        """Compute the squared loss of this stump and the per-point residuals."""
        se = 0
        se_i = []
        for x_i, y_i in izip(x, y):
            pred = self.predict(x_i)
            se += (pred - y_i)**2
            se_i.append(y_i - pred)
        self.se_i = se_i
        self.se = se

    def predict(self, x):
        if x <= self.s:
            return self.c_final[0]
        else:
            return self.c_final[1]


class CombineTree(object):
    """Combine several single-split stumps into one piecewise-constant tree."""
    def __init__(self):
        self.stumps = []
        self.s = []             # sorted, de-duplicated split points
        self.c = []             # constant for each region between split points
        self.se = None
        self.se_i = None        # per-point residuals, used to train the next stump

    def add(self, stump):
        self.stumps.append(stump)
        self.s.append(stump.s)
        self.s = sorted(set(self.s))            # keep split points unique and sorted
        c = zeros((len(self.s)+1, 1))
        for stump in self.stumps:
            id = self.s.index(stump.s)
            c[:id+1] += stump.c_final[0]        # the stump's left constant applies up to its split
            c[id+1:] += stump.c_final[1]        # the stump's right constant applies beyond it
        self.c = c

    def predict(self, x):
        for i, s in enumerate(self.s):
            if x <= s:
                return self.c[i]
        return self.c[-1]

    def get_se(self, x, y):
        """Compute the squared loss of the combined tree and the per-point residuals."""
        se = 0
        se_i = []
        for x_i, y_i in izip(x, y):
            pred = self.predict(x_i)
            se += (pred - y_i) ** 2
            se_i.append(y_i - pred)
        self.se_i = se_i
        self.se = se
        return self.se, self.se_i


if __name__ == '__main__':
    # Example 8.2 from the book
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    y = [5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]
    x, y = array(x), array(y)
    # tree = Stump()
    # print tree.fit(x, y)
    # print tree.se, tree.se_i

    ctree = CombineTree()
    s1 = Stump()
    s1.s, s1.c_final = 6.5, [6.24, 8.91]
    s2 = Stump()
    s2.s, s2.c_final = 3.5, [-0.52, 0.22]
    s3 = Stump()
    s3.s, s3.c_final = 6.5, [0.15, -0.22]
    ctree.add(s1)
    print ctree.get_se(x, y)
    ctree.add(s2)
    print ctree.get_se(x, y)
    ctree.add(s3)
    print ctree.get_se(x, y)
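As a quick worked check on CombineTree.add with the stumps above: after ctree.add(s1) (split 6.5, constants 6.24 / 8.91) and ctree.add(s2) (split 3.5, constants -0.52 / 0.22), the merged split points are [3.5, 6.5] and the summed region constants are 6.24 - 0.52 = 5.72 for x <= 3.5, 6.24 + 0.22 = 6.46 for 3.5 < x <= 6.5, and 8.91 + 0.22 = 9.13 for x > 6.5, so ctree.predict(5) falls in the middle region and returns 6.46.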
decision_tree.py @@ -0,0 +1,120 @@
# -*- coding:utf-8 -*-
"""
author: acrafter
date: 17-4-26
Decision tree, ID3
"""
from __future__ import division

import math
from itertools import izip
from collections import Counter, defaultdict

import numpy as np


class Node(object):
    def __init__(self, feature):
        self.feature = feature
        self.value = {}   # maps a feature value to either a class label (leaf) or a child Node


class DecisionTree(object):
    def __init__(self):
        self._label_counter = Counter()                      # corresponds to C_k
        self._feature_counter = defaultdict(Counter)         # corresponds to D_i
        self._feature_label_counter = defaultdict(Counter)   # corresponds to D_ik

        self._entropy = 0
        self.information_gain = {}

    def _fit(self, x, y):
        d = 0
        for x_i, y_i in izip(x, y):
            d += 1
            self._label_counter[y_i] += 1
            for feature, value in enumerate(x_i):
                self._feature_counter[feature][value] += 1
                self._feature_label_counter[(feature, value)][y_i] += 1

        # empirical entropy H(D)
        for _, c_i in self._label_counter.iteritems():
            self._entropy += -1*(abs(c_i)/abs(d))*math.log(abs(c_i)/abs(d), 2)

        # information gain g(D, A) = H(D) - H(D|A) for every feature A
        for feature, value_count in self._feature_counter.iteritems():
            _condition_value = 0
            value_count = value_count.iteritems()
            for value, count in value_count:
                for label, count2 in self._feature_label_counter[(feature, value)].iteritems():
                    _condition_value += (abs(count)/abs(d))*(abs(count2)/abs(count))*math.log(abs(count2)/abs(count), 2)
            _condition_value *= -1
            self.information_gain[feature] = self._entropy - _condition_value

        return True

    def build_tree(self, x, y):
        # re-initialise the counters for this node
        self._label_counter = Counter()
        self._feature_counter = defaultdict(Counter)
        self._feature_label_counter = defaultdict(Counter)
        self._entropy = 0
        self.information_gain = {}

        self._fit(x, y)
        self.information_gain = sorted(self.information_gain.iteritems(), key=lambda item: item[1], reverse=True)
        print 'Information gain of each feature\n', self.information_gain

        feature = self.information_gain[0][0]
        print 'Selecting feature {0}'.format(feature)
        node = Node(feature)

        # keep local references: the recursive call below re-initialises the instance counters
        feature_counter = self._feature_counter[feature]
        feature_label_counter = self._feature_label_counter

        for value, counter in feature_counter.iteritems():
            if len(feature_label_counter[(feature, value)]) == 1:
                # only one class left for this value: make a leaf
                node.value[value] = feature_label_counter[(feature, value)].keys()[0]
                print 'Value', value, 'leaf node', node.value[value]
            else:
                # recurse on the subset of rows that take this value
                temp = x[:, feature] == value
                node.value[value] = self.build_tree(x[temp], y[temp])

        return node


if __name__ == '__main__':
    # The loan-application data table from the book, numerically encoded; the features are 0, 1, 2, 3
    # age: young, middle-aged, old = 1, 2, 3
    # has a job: yes, no = 1, 2
    # owns a house: yes, no = 1, 2
    # credit rating: fair, good, excellent = 1, 2, 3
    data = [[1, 2, 2, 1],
            [1, 2, 2, 2],
            [1, 1, 2, 2],
            [1, 1, 1, 1],
            [1, 2, 2, 1],
            [2, 2, 2, 1],
            [2, 2, 2, 2],
            [2, 1, 1, 2],
            [2, 2, 1, 3],
            [2, 2, 1, 3],
            [3, 2, 1, 3],
            [3, 2, 1, 2],
            [3, 1, 2, 2],
            [3, 1, 2, 3],
            [3, 2, 2, 1]]
    label = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]
    data, label = np.array(data), np.array(label)

    decision_tree = DecisionTree()
    id3_tree = decision_tree.build_tree(data, label)

    """Output
    Information gain of each feature
    [(2, 0.4199730940219748), (3, 0.36298956253708536), (1, 0.32365019815155616), (0, 0.08300749985576883)]
    Selecting feature 2
    Value 1 leaf node 1
    Information gain of each feature
    [(1, 0.9182958340544896), (3, 0.47385138961004514), (0, 0.2516291673878229), (2, 0.0)]
    Selecting feature 1
    Value 1 leaf node 1
    Value 2 leaf node 0
    """