1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
| import math import numpy as np
def createDataLH(): data = np.array([['青年', '否', '否', '一般']]) data = np.append(data, [['青年', '否', '否', '好']], axis = 0) data = np.append(data, [['青年', '是', '否', '好'] , ['青年', '是', '是', '一般'] , ['青年', '否', '否', '一般'] , ['中年', '否', '否', '一般'] , ['中年', '否', '否', '好'] , ['中年', '是', '是', '好'] , ['中年', '否', '是', '非常好'] , ['中年', '否', '是', '非常好'] , ['老年', '否', '是', '非常好'] , ['老年', '否', '是', '好'] , ['老年', '是', '否', '好'] , ['老年', '是', '否', '非常好'] , ['老年', '否', '否', '一般'] ], axis = 0) label = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']) name = np.array(['年龄', '有工作', '有房子', '信贷情况']) return data, label, name
def createDataXG20(): data = np.array([['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'] , ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'] , ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'] , ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'] , ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'] , ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘'] , ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘'] , ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑'] , ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑'] , ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘'] , ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑'] , ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘'] , ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑'] , ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑'] , ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘'] , ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑'] , ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑']]) label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否']) name = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']) return data, label, name
def splitXgData20(xgData, xgLabel): xgDataTrain = xgData[[0, 1, 2, 5, 6, 9, 13, 14, 15, 16],:] xgDataTest = xgData[[3, 4, 7, 8, 10, 11, 12],:] xgLabelTrain = xgLabel[[0, 1, 2, 5, 6, 9, 13, 14, 15, 16]] xgLabelTest = xgLabel[[3, 4, 7, 8, 10, 11, 12]] return xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest
equalNums = lambda x,y: 0 if x is None else x[x==y].size
def singleEntropy(x): """计算一个输入序列的信息熵""" x = np.asarray(x) xValues = set(x) entropy = 0 for xValue in xValues: p = equalNums(x, xValue) / x.size entropy -= p * math.log(p, 2) return entropy
def conditionnalEntropy(feature, y): """计算 某特征feature 条件下y的信息熵""" feature = np.asarray(feature) y = np.asarray(y) featureValues = set(feature) entropy = 0 for feat in featureValues: p = equalNums(feature, feat) / feature.size entropy += p * singleEntropy(y[feature == feat]) return entropy
def infoGain(feature, y): return singleEntropy(y) - conditionnalEntropy(feature, y)
def infoGainRatio(feature, y): return 0 if singleEntropy(feature) == 0 else infoGain(feature, y) / singleEntropy(feature)
lhData, lhLabel, lhName = createDataLH() print("书中H(D)为0.971,函数结果:" + str(round(singleEntropy(lhLabel), 3))) print("书中g(D, A1)为0.083,函数结果:" + str(round(infoGain(lhData[:,0] ,lhLabel), 3))) print("书中g(D, A2)为0.324,函数结果:" + str(round(infoGain(lhData[:,1] ,lhLabel), 3))) print("书中g(D, A3)为0.420,函数结果:" + str(round(infoGain(lhData[:,2] ,lhLabel), 3))) print("书中g(D, A4)为0.363,函数结果:" + str(round(infoGain(lhData[:,3] ,lhLabel), 3)))
|