Skip to content

Commit

Permalink
Merge pull request apachecn#389 from cclauss/patch-2
Browse files Browse the repository at this point in the history
有心了,谢谢小哥哥 | print() is a function in Python 3
  • Loading branch information
jiangzhonglian authored Jun 7, 2018
2 parents 62593ed + 736778f commit d5379f8
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 31 deletions.
26 changes: 13 additions & 13 deletions src/py3.x/ML/13.PCA/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ def pca(dataMat, topNfeat=9999999):

# 计算每一列的均值
meanVals = mean(dataMat, axis=0)
# print ('meanVals', meanVals)
# print('meanVals', meanVals)

# 每个向量同时都减去 均值
meanRemoved = dataMat - meanVals
# print ('meanRemoved=', meanRemoved)
# print('meanRemoved=', meanRemoved)

# cov协方差=[(x1-x均值)*(y1-y均值)+(x2-x均值)*(y2-y均值)+...+(xn-x均值)*(yn-y均值)]/(n-1)
'''
Expand All @@ -53,8 +53,8 @@ def pca(dataMat, topNfeat=9999999):

# eigVals为特征值, eigVects为特征向量
eigVals, eigVects = linalg.eig(mat(covMat))
# print ('eigVals=', eigVals)
# print( 'eigVects=', eigVects)
# print('eigVals=', eigVals)
# print('eigVects=', eigVects)
# 对特征值,进行从小到大的排序,返回从小到大的index序号
# 特征值的逆序就可以得到topNfeat个最大的特征向量
'''
Expand All @@ -70,20 +70,20 @@ def pca(dataMat, topNfeat=9999999):
array([0, 2, 1])
'''
eigValInd = argsort(eigVals)
# print ('eigValInd1=', eigValInd)
# print('eigValInd1=', eigValInd)

# -1表示倒序,返回topN的特征值[-1 到 -(topNfeat+1) 但是不包括-(topNfeat+1)本身的倒序]
eigValInd = eigValInd[:-(topNfeat+1):-1]
# print ('eigValInd2=', eigValInd)
# print('eigValInd2=', eigValInd)
# 重组 eigVects 最大到最小
redEigVects = eigVects[:, eigValInd]
# print ('redEigVects=', redEigVects.T)
# print('redEigVects=', redEigVects.T)
# 将数据转换到新空间
# print( "---", shape(meanRemoved), shape(redEigVects))
lowDDataMat = meanRemoved * redEigVects
reconMat = (lowDDataMat * redEigVects.T) + meanVals
# print ('lowDDataMat=', lowDDataMat)
# print ('reconMat=', reconMat)
# print('lowDDataMat=', lowDDataMat)
# print('reconMat=', reconMat)
return lowDDataMat, reconMat


Expand Down Expand Up @@ -130,7 +130,7 @@ def analyse_data(dataMat):
最后,我们可能会注意到有一些小的负值,他们主要源自数值误差应该四舍五入成0.
'''
print '主成分:%s, 方差占比:%s%%, 累积方差占比:%s%%' % (format(i+1, '2.0f'), format(line_cov_score/cov_all_score*100, '4.2f'), format(sum_cov_score/cov_all_score*100, '4.1f'))
print('主成分:%s, 方差占比:%s%%, 累积方差占比:%s%%' % (format(i+1, '2.0f'), format(line_cov_score/cov_all_score*100, '4.2f'), format(sum_cov_score/cov_all_score*100, '4.1f')))


if __name__ == "__main__":
Expand All @@ -140,14 +140,14 @@ def analyse_data(dataMat):
# lowDmat, reconMat = pca(dataMat, 1)
# # 只需要2个特征向量,和原始数据一致,没任何变化
# # lowDmat, reconMat = pca(dataMat, 2)
# # print (shape(lowDmat))
# # print(shape(lowDmat))
# show_picture(dataMat, reconMat)

# 利用PCA对半导体制造数据降维
dataMat = replaceNanWithMean()
print (shape(dataMat))
print(shape(dataMat))
# 分析数据
analyse_data(dataMat)
# lowDmat, reconMat = pca(dataMat, 20)
# print (shape(lowDmat))
# print(shape(lowDmat))
# show_picture(dataMat, reconMat)
14 changes: 7 additions & 7 deletions src/py3.x/ML/7.AdaBoost/sklearn-adaboost-demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@
plt.legend()
plt.show()

print 'y---', type(y[0]), len(y), y[:4]
print 'y_1---', type(y_1[0]), len(y_1), y_1[:4]
print 'y_2---', type(y_2[0]), len(y_2), y_2[:4]
print('y---', type(y[0]), len(y), y[:4])
print('y_1---', type(y_1[0]), len(y_1), y_1[:4])
print('y_2---', type(y_2[0]), len(y_2), y_2[:4])

# 适合2分类
y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])
print 'y_scores---', type(y_scores[0]), len(y_scores), y_scores
print metrics.roc_auc_score(y_true, y_scores)
print('y_scores---', type(y_scores[0]), len(y_scores), y_scores)
print(metrics.roc_auc_score(y_true, y_scores))

# print "-" * 100
# print metrics.roc_auc_score(y[:1], y_2[:1])
# print("-" * 100)
# print(metrics.roc_auc_score(y[:1], y_2[:1]))
12 changes: 6 additions & 6 deletions src/py3.x/ML/7.RandomForest/randomForest.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def get_split(dataset, n_features):
# 左右两边的数量越一样,说明数据区分度不高,gini系数越大
if gini < b_score:
b_index, b_value, b_score, b_groups = index, row[index], gini, groups # 最后得到最优的分类特征 b_index,分类特征值 b_value,分类结果 b_groups。b_value 为分错的代价成本
# print b_score
# print(b_score)
return {'index': b_index, 'value': b_value, 'groups': b_groups}


Expand Down Expand Up @@ -303,7 +303,7 @@ def evaluate_algorithm(dataset, algorithm, n_folds, *args):

# 加载数据
dataset = loadDataSet('input/7.RandomForest/sonar-all-data.txt')
# print dataset
# print(dataset)

n_folds = 5 # 分成5份数据,进行交叉验证
max_depth = 20 # 调参(自己修改) #决策树深度不能太深,不然容易导致过拟合
Expand All @@ -315,7 +315,7 @@ def evaluate_algorithm(dataset, algorithm, n_folds, *args):
scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
# 每一次执行本文件时都能产生同一个随机数
seed(1)
print 'random=', random()
print 'Trees: %d' % n_trees
print 'Scores: %s' % scores
print 'Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores)))
print('random=', random())
print('Trees: %d' % n_trees)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
10 changes: 5 additions & 5 deletions tools/DecisionTree_getInfoGain.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def calcShannonEnt(dataSet):
"""
# 求list的长度,表示计算参与训练的数据量
numEntries = len(dataSet)
# print type(dataSet), 'numEntries: ', numEntries
# print(type(dataSet), 'numEntries: ', numEntries)

# 计算分类标签label出现的次数
labelCounts = {}
Expand All @@ -26,15 +26,15 @@ def calcShannonEnt(dataSet):
if currentLabel not in labelCounts.keys():
labelCounts[currentLabel] = 0
labelCounts[currentLabel] += 1
# print '-----', featVec, labelCounts
# print('-----', featVec, labelCounts)

# 对于label标签的占比,求出label标签的香农熵
shannonEnt = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/numEntries
# log base 2
shannonEnt -= prob * log(prob, 2)
# print '---', prob, prob * log(prob, 2), shannonEnt
# print('---', prob, prob * log(prob, 2), shannonEnt)
return shannonEnt


Expand Down Expand Up @@ -100,7 +100,7 @@ def getFeatureShannonEnt(dataSet, labels):
# gain[信息增益]=0, 表示与类别相同,无需其他的分类
# gain[信息增益]=baseEntropy, 表示分类和没分类没有区别
infoGain = baseEntropy - newEntropy
# print infoGain
# print(infoGain)
if (infoGain > bestInfoGain):
endEntropy = newEntropy
bestInfoGain = infoGain
Expand All @@ -120,5 +120,5 @@ def getFeatureShannonEnt(dataSet, labels):
infoGain1 = getFeatureShannonEnt(dataSet1, labels)
infoGain2 = getFeatureShannonEnt(dataSet2, labels)
infoGain3 = getFeatureShannonEnt(dataSet3, labels)
print '信息增益: \n\t%s, \n\t%s, \n\t%s' % (infoGain1, infoGain2, infoGain3)
print('信息增益: \n\t%s, \n\t%s, \n\t%s' % (infoGain1, infoGain2, infoGain3))

0 comments on commit d5379f8

Please sign in to comment.